CUTLASS 3.6.0 (#1850)

* v3.6

* update changelog

* update readme

* fix typo

* fixing typos

* hopper gemm with weight prefetch

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
Yujia Zhai
2024-10-09 12:33:27 -07:00
committed by GitHub
parent 0837a2a00a
commit cc3c29a81a
354 changed files with 105943 additions and 8203 deletions

View File

@ -221,7 +221,8 @@ cutlass_add_cutlass_library(
# files split for parallel compilation
src/reference/gemm_int4.cu
src/reference/gemm_int8_canonical.cu
src/reference/gemm_s8_s8_s32.cu
src/reference/gemm_u8_u8_s32.cu
src/reference/gemm_int8_interleaved_32.cu
src/reference/gemm_int8_interleaved_64.cu
src/reference/gemm_e4m3a_e4m3out.cu
@ -278,6 +279,7 @@ execute_process(
--generator-target library
--architectures "${CUTLASS_NVCC_ARCHS_ENABLED}"
--kernels "${CUTLASS_LIBRARY_KERNELS}"
--instantiation-level "${CUTLASS_LIBRARY_INSTANTIATION_LEVEL}"
--ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}"
--exclude-kernels "${CUTLASS_LIBRARY_EXCLUDE_KERNELS}"
--kernel-filter-file "${KERNEL_FILTER_FILE}"

View File

@ -113,6 +113,12 @@ template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
static int const kMax = 90;
};
// Arch conditional sparse WGMMA
template <> struct ArchMap<arch::Sm90, arch::OpClassSparseTensorOp> {
static int const kMin = 90;
static int const kMax = 90;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library

View File

@ -103,6 +103,17 @@ public:
void *device_workspace = nullptr,
cudaStream_t stream = nullptr) const = 0;
// Originally designed for metadata, but should be useful for FP8/6/4 too.
virtual Status initialize_with_profiler_workspace(
void const *configuration,
void *host_workspace,
void *device_workspace,
uint8_t **profiler_workspace_ptrs,
int problem_count,
cudaStream_t stream = nullptr) {
return Status::kErrorNotSupported;
}
virtual Status run(
void const *arguments,
void *host_workspace,
@ -290,7 +301,6 @@ struct GemmUniversalArguments {
// Needed for some 3.x kernels
int sm_count{0};
library::RasterOrder raster_order{};
int swizzle_size{1};
};
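
A caller-side sketch of how the new hook composes with the existing interface (the surrounding variables `op`, `configuration`, `profiler_workspace_ptrs`, `problem_count` are hypothetical): operations that do not override the virtual report Status::kErrorNotSupported, so a profiler can fall back to the regular initialize() path.

Status init_status = op->initialize_with_profiler_workspace(
    &configuration, host_workspace, device_workspace,
    profiler_workspace_ptrs, problem_count, stream);
if (init_status == Status::kErrorNotSupported) {
  // Operation has no profiler-workspace specialization; use the default path.
  init_status = op->initialize(&configuration, host_workspace, device_workspace, stream);
}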

View File

@ -616,7 +616,7 @@ private:
/* traversal_stride = */ {traversal_stride_h, traversal_stride_w},
/* dilation = */ {dilation_h, dilation_w},
num_groups);
out_args.mainloop.problem_shape = problem_shape;
out_args.problem_shape = problem_shape;
// ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
@ -788,7 +788,7 @@ private:
/* traversal_stride = */ {traversal_stride_d, traversal_stride_h, traversal_stride_w},
/* dilation = */ {dilation_d, dilation_h, dilation_w},
num_groups);
out_args.mainloop.problem_shape = problem_shape;
out_args.problem_shape = problem_shape;
// ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)

View File

@ -249,7 +249,6 @@ protected:
/* Query device SM count to pass onto the kernel as an argument, where needed */
operator_args.hw_info.sm_count = arguments->sm_count;
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
}
@ -282,17 +281,18 @@ public:
static_cast<GemmUniversalArguments const *>(arguments_ptr);
OperatorArguments args;
auto status = update_arguments_(args, arguments);
if (status != Status::kSuccess) {
return status;
}
// can_implement rules may need access to problem shape
args.problem_shape = cute::make_shape(
configuration->problem_size.m(),
configuration->problem_size.n(),
configuration->problem_size.k(),
configuration->batch_count);
auto status = update_arguments_(args, arguments);
if (status != Status::kSuccess) {
return status;
}
return Operator::can_implement(args);
}

View File

@ -121,14 +121,14 @@ void initialize_gemm_reference_operations_fp_mixed_input(Manifest &manifest) {
half_t,
int8_t,
half_t,
float
>(manifest);
make_gemm_real_canonical_layouts<
half_t,
uint8_t,
half_t,
float
>(manifest);
// bfloat16_t mixed with 8-bit integer input

View File

@ -54,6 +54,14 @@ void initialize_gemm_reference_operations_fp_other(Manifest &manifest) {
half_t
>(manifest);
make_gemm_real_canonical_layouts<
half_t,
half_t,
float,
half_t,
half_t
>(manifest);
make_gemm_real_canonical_layouts<
double,
double,

View File

@ -73,7 +73,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int4b_t,
int8_t,
@ -110,7 +110,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int8_t,
int4b_t,

View File

@ -0,0 +1,146 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Instantiates GEMM reference implementations.
*/
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
#include "gemm_reference_operation.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
// A/B: s8
// Acc : s32
// C/D: some variance
// Epi Scalar: some variance
// 1. s8_s8_s32_s32_s32 (s32 epi scalar)
// 2. s8_s8_s32_s32_s32 (f32 epi scalar)
// 3. s8_s8_s32_s8_s8 (f32 epi scalar)
// 4. s8_s8_s32_s8_s8 (s32 epi scalar)
// 5. s8_s8_s32_s32_s8 (f32 epi scalar)
// 6. s8_s8_s32_f32_f32
// 7. s8_s8_s32_f16_f16 (f32 epi scalar)
// D = convert( Scalar(alpha) * Scalar( A * B ) + Scalar(beta) * Scalar( C ) )
// Convert: from epi Scalar dtype to D dtype
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest) {
// 1.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t // ElementD
>(manifest);
// 2.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t, // ElementD
NumericConverterClamp<int32_t, float> // From Scalar to D
>(manifest);
// 3.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int8_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
// 4.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int8_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, int32_t> // From Scalar to D
>(manifest);
// 5.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
// 6.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
float, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
float // ElementD
>(manifest);
// 7.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
half_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
half_t, // ElementD
NumericConverterClamp<half_t, float> // From Scalar to D
>(manifest);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
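
As an illustration of the epilogue formula above, variant 3 (f32 epilogue scalar, s32 accumulator, s8 output) evaluates, per element, something like the following sketch; the variable names and values are hypothetical, not part of this commit.

float   alpha = 1.0f, beta = 1.0f;                            // f32 epi scalars
int32_t accum = -200;                                         // one A * B accumulator entry
int8_t  c_in  = 100;                                          // matching C entry
float   epi   = alpha * float(accum) + beta * float(c_in);    // -100.0f
int8_t  d_out = NumericConverterClamp<int8_t, float>{}(epi);  // clamped convert to D: -100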

View File

@ -45,72 +45,48 @@ namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
void initialize_gemm_reference_operations_int8_canonical(Manifest &manifest) {
// A/B: u8
// Acc : s32
// C/D: some variance
// 1. u8_u8_s32_s32_s32 (s32 epi scalar)
// 2. u8_u8_s32_s32_s32 (f32 epi scalar)
// 3. u8_u8_s32_s8_s8 (f32 epi scalar)
void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest) {
// 1.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int32_t,
int32_t,
int32_t
uint8_t, // ElementA
uint8_t, // ElementB
int32_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t // ElementD
>(manifest);
// 2.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int8_t,
float,
int32_t,
int8_t,
NumericConverterClamp<int8_t, float>
uint8_t, // ElementA
uint8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t, // ElementD
NumericConverterClamp<int32_t, float> // From Scalar to D
>(manifest);
// 3.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int32_t,
float,
int32_t,
int32_t,
NumericConverterClamp<int32_t, float>
uint8_t, // ElementA
uint8_t, // ElementB
int8_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int32_t,
int32_t,
int32_t
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int8_t,
float,
int32_t,
int8_t,
NumericConverterClamp<int8_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int32_t,
float,
int32_t,
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int8_t,
int32_t,
int32_t,
int8_t,
NumericConverterClamp<int8_t, int32_t>
>(manifest);
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -46,7 +46,8 @@ namespace library {
void initialize_gemm_reference_operations_int4(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_32(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_64(Manifest &manifest);
void initialize_gemm_reference_operations_int8_canonical(Manifest &manifest);
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e5m2a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e5m2out(Manifest &manifest);
@ -72,7 +73,8 @@ void initialize_reference_operations(Manifest &manifest) {
initialize_gemm_reference_operations_int8_interleaved_32(manifest);
initialize_gemm_reference_operations_int8_interleaved_64(manifest);
initialize_gemm_reference_operations_int8_canonical(manifest);
initialize_gemm_reference_operations_s8_s8_s32(manifest);
initialize_gemm_reference_operations_u8_u8_s32(manifest);
initialize_gemm_reference_operations_e4m3a_e4m3out(manifest);
initialize_gemm_reference_operations_e5m2a_e4m3out(manifest);
@ -85,7 +87,6 @@ void initialize_reference_operations(Manifest &manifest) {
initialize_gemm_reference_operations_fp32out(manifest);
initialize_gemm_reference_operations_fp_other(manifest);
initialize_gemm_reference_operations_fp_mixed_input(manifest);
initialize_gemm_reference_operations_int_mixed_input(manifest);
}

View File

@ -0,0 +1,445 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Defines operations for all GEMM operation kinds in CUTLASS Library.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" // StructuredSparseCompressor
#include "cutlass/transform/device/transform_universal_adapter.hpp" // TransformUniversalAdapter
#include "cutlass/util/packed_stride.hpp" // make_cute_packed_stride
#include "gemm_operation_3x.hpp"
#include "library_internal.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
#define CUDA_CHECK(cuda_error) \
{ \
if (cuda_error != cudaSuccess) { \
printf("cudaError %s in %s:%d\n", cudaGetErrorString(cuda_error), __func__, __LINE__ ); \
return Status::kInvalid; \
} \
}
namespace cutlass::library {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Limitation & Assumptions:
// 1. The tensor must be densely packed. That is, lda is k if the tensor is k-major,
// and lda is m if the tensor is m-major.
// 2. The circular buffers for tensorA and tensorE may hold fewer entries than those for tensorB
//    and the other tensors, because problem_count is not available in get_device_workspace_size().
//    When the circular buffer is enabled, at least 192MB of memory is guaranteed to be used.
template <typename Operator_>
class SparseGemmUniversal3xOperation : public GemmOperation3xBase<Operator_> {
public:
using Operator = Operator_;
using OperatorArguments = typename Operator::Arguments;
using ElementA = typename Operator::ElementA;
using LayoutA = typename Operator::LayoutA;
using ElementB = typename Operator::ElementB;
using LayoutB = typename Operator::LayoutB;
using ElementC = typename Operator::ElementC;
using LayoutC = typename Operator::LayoutC;
using ElementD = typename Operator::ElementD;
using LayoutD = typename Operator::LayoutD;
using ElementAccumulator = typename Operator::ElementAccumulator;
using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
using CollectiveMainloop = typename Operator::CollectiveMainloop;
using CollectiveEpilogue = typename Operator::CollectiveEpilogue;
using ThreadEpilogueOp = typename CollectiveEpilogue::ThreadEpilogueOp;
using ElementE = typename CollectiveMainloop::ElementE;
using LayoutE = typename CollectiveMainloop::LayoutE;
using SparseConfig = typename CollectiveMainloop::SparseConfig;
using LayoutATag = decltype(SparseConfig::deduce_layoutA_tag(typename CollectiveMainloop::LayoutA{}));
using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility<
cute::Shape<int, int, int, int>,
ElementA,
LayoutATag,
SparseConfig>;
using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor<
cute::Shape<int, int, int, int>,
ElementA,
LayoutATag,
SparseConfig,
typename Operator::ArchTag>;
using Compressor = cutlass::transform::device::TransformUniversalAdapter<CompressorKernel>;
public:
/// Constructor
SparseGemmUniversal3xOperation(char const *name = "unknown_gemm"):
GemmOperation3xBase<Operator_>(name, GemmKind::kUniversal) {}
protected:
/// Constructs the arguments structure given the configuration and arguments
static Status construct_arguments_(
OperatorArguments &operator_args, GemmUniversalConfiguration const *configuration) {
// NOTE: GemmUniversalConfiguration does not contain problem shapes or batch strides
// Do nothing here and construct kernel arguments in update_arguments_ instead
// We also cannot construct TMA descriptors without all the arguments available
operator_args.mode = configuration->mode;
return Status::kSuccess;
}
template<class FusionArgs, class = void>
struct UpdateFusionArgs {
static Status update_(FusionArgs const& fusion_args, GemmUniversalArguments const &arguments) {
// If a custom EVT is instantiated then it is the user's responsibility
// to ensure alpha and beta are updated appropriately
return Status::kSuccess;
}
};
template<class FusionArgs>
struct UpdateFusionArgs<FusionArgs, cute::void_t<decltype(FusionArgs{}.alpha)>> {
static Status update_(FusionArgs& fusion_args, GemmUniversalArguments const &arguments) {
if (arguments.pointer_mode == ScalarPointerMode::kHost) {
fusion_args.alpha = *static_cast<ElementCompute const *>(arguments.alpha);
fusion_args.beta = *static_cast<ElementCompute const *>(arguments.beta);
fusion_args.alpha_ptr = nullptr;
fusion_args.beta_ptr = nullptr;
return Status::kSuccess;
}
else if (arguments.pointer_mode == ScalarPointerMode::kDevice) {
fusion_args.alpha = 0;
fusion_args.beta = 0;
fusion_args.alpha_ptr = static_cast<ElementCompute const *>(arguments.alpha);
fusion_args.beta_ptr = static_cast<ElementCompute const *>(arguments.beta);
return Status::kSuccess;
}
else {
return Status::kErrorInvalidProblem;
}
}
};
/// Constructs the arguments structure given the configuration and arguments
static Status update_arguments_(
OperatorArguments &operator_args,
GemmUniversalArguments const *arguments,
CompressorUtility const& compressor_utility,
void* device_a_compressed_ptr = nullptr,
void* device_e_ptr = nullptr) {
Status status = Status::kSuccess;
status = UpdateFusionArgs<decltype(operator_args.epilogue.thread)>::update_(
operator_args.epilogue.thread, *arguments);
if (status != Status::kSuccess) {
return status;
}
// TODO: type erase Arguments structure in 3.0 GEMM
operator_args.problem_shape = cute::make_shape(
arguments->problem_size.m(),
arguments->problem_size.n(),
arguments->problem_size.k(),
arguments->batch_count);
// update arguments
operator_args.mainloop.ptr_A = reinterpret_cast<ElementA const *>(device_a_compressed_ptr);
operator_args.mainloop.ptr_B = static_cast<ElementB const *>(arguments->B);
operator_args.mainloop.ptr_E = reinterpret_cast<ElementE const *>(device_e_ptr);
operator_args.epilogue.ptr_C = static_cast<ElementC const *>(arguments->C);
operator_args.epilogue.ptr_D = static_cast<ElementD *>(arguments->D);
operator_args.mainloop.layout_a = compressor_utility.fill_layoutA_from_compressor();
operator_args.mainloop.layout_e = compressor_utility.fill_layoutE_from_compressor();
operator_args.mainloop.dB = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideB>(
arguments->ldb, arguments->batch_stride_B);
operator_args.epilogue.dC = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideC>(
arguments->ldc, arguments->batch_stride_C);
operator_args.epilogue.dD = operator_args.epilogue.dC;
/* Query device SM count to pass onto the kernel as an argument, where needed */
operator_args.hw_info.sm_count = arguments->sm_count;
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
}
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.raster_order)>) {
using Enum_t = decltype(operator_args.scheduler.raster_order);
switch (arguments->raster_order) {
case RasterOrder::kAlongN:
operator_args.scheduler.raster_order = Enum_t::AlongN;
break;
case RasterOrder::kAlongM:
operator_args.scheduler.raster_order = Enum_t::AlongM;
break;
default:
operator_args.scheduler.raster_order = Enum_t::Heuristic;
}
}
return status;
}
public:
/// Returns success if the operation can proceed
Status can_implement(
void const *configuration_ptr, void const *arguments_ptr) const override {
GemmUniversalConfiguration const *configuration =
static_cast<GemmUniversalConfiguration const *>(configuration_ptr);
GemmUniversalArguments const *arguments =
static_cast<GemmUniversalArguments const *>(arguments_ptr);
OperatorArguments args;
auto problem_shape_MNKL = cute::make_shape(
configuration->problem_size.m(),
configuration->problem_size.n(),
configuration->problem_size.k(),
configuration->batch_count);
const int M = configuration->problem_size.m();
const int N = configuration->problem_size.n();
const int K = configuration->problem_size.k();
const int L = configuration->batch_count;
using StrideA = typename CompressorUtility::StrideA;
auto dA = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
compressor_utility.set_problem_size(problem_shape_MNKL, dA);
auto status = update_arguments_(args, arguments, compressor_utility);
if (status != Status::kSuccess) {
return status;
}
// can_implement rules may need access to problem shape
args.problem_shape = problem_shape_MNKL;
return Operator::can_implement(args);
}
/// Gets the host-side workspace
uint64_t get_host_workspace_size(void const *) const override {
// Memory to hold operator
host_op_workspace_size = sizeof(Operator);
// Memory to hold result of `.structure_sparse_zero_mask_fill()`
tensor_a_size = compressor_utility.get_raw_tensor_A_bytes();
// NOTE: order here is the order of workspace partition
const uint64_t size = host_op_workspace_size + tensor_a_size;
return size;
}
/// Gets the device-side workspace
uint64_t get_device_workspace_size(
void const *configuration_ptr, void const *arguments_ptr) const override {
OperatorArguments args;
auto status = update_arguments_(
args, static_cast<GemmUniversalArguments const *>(arguments_ptr), compressor_utility);
if (status != Status::kSuccess) {
return 0;
}
typename Compressor::Arguments compress_arguments {
{compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
{/*Empty Not Use*/},
{/*Empty Not Use*/} };
// Size for one iteration
// For multi-iteration, will need to multiply result of this function w/ actual problem_count
tensor_ac_size = compressor_utility.get_compressed_tensor_A_bytes();
tensor_e_size = compressor_utility.get_tensor_E_bytes();
device_op_workspace_size = Operator::get_workspace_size(args);
device_compress_workspace_size = Compressor::get_workspace_size(compress_arguments);
// NOTE: order here is the order of workspace partition
device_per_iter_workspace_size = device_op_workspace_size + device_compress_workspace_size + tensor_ac_size + tensor_e_size;
return device_per_iter_workspace_size;
}
/// Initializes the workspace
Status initialize(
void const *configuration_ptr,
void *host_workspace,
void *device_workspace,
cudaStream_t stream = nullptr) const override {
return Status::kErrorInternal;
}
Status initialize_with_profiler_workspace(
void const *configuration,
void *host_workspace,
void *device_workspace,
uint8_t **profiler_workspaces,
int problem_count_from_profiler,
cudaStream_t stream = nullptr) {
// Set problem_count.
problem_count = problem_count_from_profiler;
// * Host Ptr
auto* host_op_workspace_ptr = reinterpret_cast<uint8_t*>(host_workspace);
auto* host_a_raw_ptr = host_op_workspace_ptr + host_op_workspace_size;
// * Construct Op
Operator *op = new (host_op_workspace_ptr) Operator;
// * Device Full Ptr
device_full_ptr = reinterpret_cast<uint8_t*>(device_workspace);
// * Device Ptr (1st iteration)
// Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
// iteri : op_workspace | tensor_ac | tensor_e
auto* device_ptr_iter1 = device_full_ptr;
auto* device_op_workspace_ptr_iter1 = device_ptr_iter1;
auto* device_compressor_workspace_ptr_iter1 = device_op_workspace_ptr_iter1 + device_op_workspace_size;
auto* device_a_compressed_ptr_iter1 = device_compressor_workspace_ptr_iter1 + device_compress_workspace_size;
auto* device_e_ptr_iter1 = device_a_compressed_ptr_iter1 + tensor_ac_size;
// * Device A Raw Ptr
auto* device_a_raw_ptr = profiler_workspaces[0];
// * Random fill 50% of TensorA w/ zero following the structured sparse requirement
cudaMemcpy(host_a_raw_ptr, device_a_raw_ptr, tensor_a_size, cudaMemcpyDeviceToHost);
compressor_utility.structure_sparse_zero_mask_fill(host_a_raw_ptr, 2000);
cudaMemcpy(device_a_raw_ptr, host_a_raw_ptr, tensor_a_size, cudaMemcpyHostToDevice);
CUDA_CHECK(cudaGetLastError());
// * Compress DTensorA and get DTensorAC & DTensorE
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = 0;
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
typename Compressor::Arguments arguments{
{compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
{device_a_raw_ptr,
compressor_utility.dA,
device_a_compressed_ptr_iter1,
device_e_ptr_iter1},
{hw_info}
};
cutlass::Status status {cutlass::Status::kSuccess };
Compressor compressor_op;
status = compressor_op.can_implement(arguments);
if (status != Status::kSuccess) {
return status;
}
status = compressor_op.initialize(arguments, device_compressor_workspace_ptr_iter1, stream);
if (status != Status::kSuccess) {
return status;
}
status = compressor_op.run(stream);
if (status != Status::kSuccess) {
return status;
}
CUDA_CHECK(cudaStreamSynchronize(stream));
// * Copy Iter1's DTensorAC DTensorE to each iteration's DTensorAC DTensorE
for (int iter_i = 1; iter_i < problem_count; iter_i++) {
// * Device AC E Ptr per iteration
// Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
// iteri : op_workspace | tensor_ac | tensor_e
auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_i;
auto* device_op_workspace_ptr = device_ptr_iteri;
auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;
cudaMemcpy(device_a_compressed_ptr, device_a_compressed_ptr_iter1, tensor_ac_size, cudaMemcpyDeviceToDevice);
cudaMemcpy(device_e_ptr, device_e_ptr_iter1, tensor_e_size, cudaMemcpyDeviceToDevice);
}
CUDA_CHECK(cudaGetLastError());
return Status::kSuccess;
}
/// Runs the kernel
Status run(
void const *arguments_ptr,
void *host_workspace,
void *device_workspace = nullptr,
cudaStream_t stream = nullptr) const override {
OperatorArguments operator_args;
auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_idx;
auto* device_op_workspace_ptr = device_ptr_iteri;
auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;
iter_idx = (iter_idx + 1) % problem_count;
Status status = update_arguments_(operator_args, static_cast<GemmUniversalArguments const *>(arguments_ptr), compressor_utility, device_a_compressed_ptr, device_e_ptr );
if (status != Status::kSuccess) {
return status;
}
Operator *op = static_cast<Operator *>(host_workspace);
// We need to call initialize() since we have to rebuild TMA desc for every new set of args
status = op->run(operator_args, device_op_workspace_ptr, stream);
return status;
}
private:
// Mutable members: state that must be updated from within the const member functions.
mutable CompressorUtility compressor_utility;
mutable int problem_count = 1;
mutable int iter_idx = 0;
uint8_t* device_full_ptr = nullptr;
mutable uint64_t tensor_ac_size = 0;
mutable uint64_t tensor_e_size = 0;
mutable uint64_t tensor_a_size = 0;
mutable uint64_t host_op_workspace_size = 0;
mutable uint64_t device_compress_workspace_size = 0;
mutable uint64_t device_op_workspace_size = 0;
mutable uint64_t device_per_iter_workspace_size = 0;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass::library
///////////////////////////////////////////////////////////////////////////////////////////////////
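
Per the comments in get_device_workspace_size() above, the returned size covers a single iteration; a sketch of the caller-side arithmetic (hypothetical variable names) for provisioning the full circular buffer:

uint64_t per_iter_bytes = op->get_device_workspace_size(&configuration, &arguments);
uint64_t total_bytes    = per_iter_bytes * uint64_t(problem_count);  // one slot per iteration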

View File

@ -756,6 +756,7 @@ OpcodeClassID_enumerants[] = {
{"tensorop", "<tensorop>", OpcodeClassID::kTensorOp},
{"wmmatensorop", "<wmmatensorop>", OpcodeClassID::kWmmaTensorOp},
{"wmma", "<wmma>", OpcodeClassID::kWmmaTensorOp},
{"sptensorop", "<sptensorop>", OpcodeClassID::kSparseTensorOp}
};
/// Converts a OpcodeClassID enumerant to a string

View File

@ -36,6 +36,7 @@
#if CUTLASS_ENABLE_CUBLAS
#include <cublas_v2.h>
#include <cublasLt.h>
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
@ -90,25 +91,48 @@ Status cublas_satisfies(library::SymmDescription const &desc);
/// Additionally, it provides implicit cast from CublasCreate's object to cublasHandle_t's object
class CublasCreate {
private:
cublasHandle_t handle;
cublasStatus_t status;
public:
CublasCreate() {
status = cublasCreate(&handle);
}
~CublasCreate() {
cublasDestroy(handle);
}
/// Implicit cast CublasCreate object to cublasHandle_t
operator cublasHandle_t() const { return handle; }
/// returns cublasStatus_t for handle creation
cublasStatus_t get_cublas_create_status() { return status; }
};
/// This is a helper class to create cublasLtHandle_t automatically on CublasLtCreate object creation and
/// to destroy cublasLtHandle_t on CublasLtCreate object destruction.
/// Additionally, it provides implicit cast from CublasLtCreate's object to cublasLtHandle_t's object
class CublasLtCreate {
private:
cublasLtHandle_t handle;
cublasStatus_t status;
public:
CublasLtCreate() {
status = cublasLtCreate(&handle);
}
~CublasLtCreate() {
cublasLtDestroy(handle);
}
/// Implicit cast CublasLtCreate object to cublasLtHandle_t
operator cublasLtHandle_t() const { return handle; }
/// returns cublasStatus_t for handle creation
cublasStatus_t get_cublaslt_create_status() { return status; }
};
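
A usage sketch for the RAII wrappers above (illustrative, not part of this commit): the handle is created in the constructor, converts implicitly wherever a raw handle is expected, and is destroyed at scope exit.

{
  CublasLtCreate lt;                                    // cublasLtCreate() runs in the constructor
  if (lt.get_cublaslt_create_status() == CUBLAS_STATUS_SUCCESS) {
    cublasLtHandle_t raw_handle = lt;                   // implicit conversion
    // ... pass `lt` (or raw_handle) to cublasLt API calls ...
  }
}                                                       // cublasLtDestroy() runs in the destructor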
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
@ -226,6 +250,80 @@ struct cublasGemmExDispatcher {
cublasStatus_t operator()(cublasHandle_t handle);
};
/// Dispatcher to cublaslt kernels
//
struct cublasLtGemmExDispatcher {
//
// Data members
//
library::GemmDescription const &op_desc;
library::GemmUniversalConfiguration configuration;
library::GemmUniversalArguments arguments;
// cublas-specific data structures to fill cublas API call arguments
cublasOperation_t trans_A;
cublasOperation_t trans_B;
cudaDataType_t data_type_A;
cudaDataType_t data_type_B;
cudaDataType_t data_type_C;
cudaDataType_t compute_data_type = CUDA_R_32F;
//cublasLt-specific data structures
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
cublasLtMatmulPreference_t preference = NULL;
//is set by call to get_cublaslt_algo()
cublasLtMatmulHeuristicResult_t heuristicResult_;
void *workspace = nullptr;
Status status;
#if (__CUDACC_VER_MAJOR__ >= 11)
cublasComputeType_t compute_type;
#endif
//
// Methods
//
cublasLtGemmExDispatcher(
library::GemmDescription const &op_desc,
library::GemmUniversalConfiguration configuration_,
library::GemmUniversalArguments arguments_
);
/// Initialize the cublasLt variables
void initialize_cublaslt();
/// Runs auto-tuning for the cublas heuristics
bool get_cublaslt_algo(cublasLtHandle_t handle,
AlgorithmMode algorithm_mode
);
/// Executes GEMM using these arguments
cublasStatus_t operator()(cublasLtHandle_t handle);
~cublasLtGemmExDispatcher(){
// descriptors are no longer needed as all GPU work was already enqueued
if (preference) cublasLtMatmulPreferenceDestroy(preference);
if (Ddesc) cublasLtMatrixLayoutDestroy(Ddesc);
if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
if (operationDesc) cublasLtMatmulDescDestroy(operationDesc);
if (workspace) {
cudaFree(workspace);
}
}
};
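
A hypothetical call sequence for the dispatcher above (error handling elided; the AlgorithmMode value and `lt_handle` are illustrative assumptions):

cublasLtGemmExDispatcher dispatcher(op_desc, configuration, arguments);
dispatcher.initialize_cublaslt();                       // build matmul/layout descriptors
if (dispatcher.get_cublaslt_algo(lt_handle, AlgorithmMode::kDefault)) {
  cublasStatus_t result = dispatcher(lt_handle);        // enqueue the GEMM
}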
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dispatcher to cublas rank k update kernels

View File

@ -48,7 +48,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// CUTLASS Profiler application
class CutlassProfiler {
private:
@ -66,13 +66,10 @@ private:
/// Prints usage
void print_usage_(std::ostream &);
/// Prints options
void print_options_(std::ostream &);
/// Initializes the device
void initialize_device_();
/// Enumerates all operations
void enumerate_();

View File

@ -81,6 +81,9 @@ private:
/// Buffer holding TensorRef instance to recently allocated memory
std::vector<uint8_t> tensor_ref_buffer_;
/// The device ID where the allocation is made
int device_;
public:
//
// Static member functions
@ -91,7 +94,7 @@ public:
/// Returns the stride of a packed layout
static std::vector<int64_t> get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent);
/// returns the capacity needed
@ -103,16 +106,16 @@ public:
/// Returns true if two blocks have exactly the same value
static bool block_compare_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity);
/// Returns true if two blocks have approximately the same value
static bool block_compare_relatively_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity,
double epsilon,
double nonzero_floor);
@ -123,15 +126,19 @@ public:
//
DeviceAllocation();
DeviceAllocation(library::NumericTypeID type, size_t capacity);
DeviceAllocation(
library::NumericTypeID type,
size_t capacity,
int device = -1);
DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
int batch_count = 1,
int device = -1);
~DeviceAllocation();
@ -142,9 +149,9 @@ public:
/// Allocates memory for a given layout and tensor
DeviceAllocation &reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
@ -157,7 +164,7 @@ public:
/// Data type of contained elements
library::NumericTypeID type() const;
/// Pointer to start of device memory allocation
void *data() const;
@ -184,7 +191,7 @@ public:
/// Capacity of allocation in number of elements
size_t capacity() const;
/// Capacity of allocation in bytes
size_t bytes() const;
@ -205,7 +212,7 @@ public:
/// Initializes a host allocation to a random distribution using std::rand()
void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);
/// Uniformly fills a tensor with a value when provided, otherwise with zero
void fill_device(double value);
@ -221,8 +228,12 @@ public:
/// Copies from an equivalent-sized tensor in device memory
void copy_to_host(void *ptr);
/// Writes a tensor to csv
void write_tensor_csv(std::ostream &out);
private:
/// A wrapper that sets the target device, performs the allocation, and restores the previous device
cudaError_t malloc(void** ptr, size_t size);
};
using DeviceAllocationList = std::list<DeviceAllocation>;
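
A minimal sketch of what the malloc wrapper above might look like (assumed implementation; this commit only shows the declaration):

cudaError_t DeviceAllocation::malloc(void** ptr, size_t size) {
  int previous_device = 0;
  cudaGetDevice(&previous_device);           // remember the caller's device
  cudaSetDevice(device_);                    // switch to this allocation's device
  cudaError_t result = cudaMalloc(ptr, size);
  cudaSetDevice(previous_device);            // restore the caller's device
  return result;
}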

View File

@ -29,7 +29,7 @@
*
**************************************************************************************************/
/* \file
\brief
*/
#pragma once
@ -68,46 +68,52 @@ private:
/// Non-owning set of named allocations
AllocationMap allocations_;
public:
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_block(
Options const &options,
std::string const &name,
library::NumericTypeID type,
size_t capacity);
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_tensor(
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
library::NumericTypeID type,
size_t capacity,
size_t device_index);
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift = 0);
size_t device_index);
/// Allocates memory for sparse meta data
DeviceAllocation *allocate_sparsemeta_tensor(
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_and_initialize_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index);
/// Allocates memory for sparse meta data
DeviceAllocation *allocate_and_initialize_sparsemeta_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift = 0);
int seed_shift,
size_t device_index);
/// Clears named allocations (but does not necessarily free memory)
void clear();
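
A hypothetical call against the revised interface above (argument values illustrative):

DeviceAllocation *A = device_context.allocate_and_initialize_tensor(
    options,
    "A",
    library::NumericTypeID::kF16,
    library::LayoutTypeID::kColumnMajor,
    {1024, 512},             // extent
    {},                      // stride: empty selects the packed default
    /* batch_count  = */ 1,
    /* seed_shift   = */ 0,
    /* device_index = */ 0);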

View File

@ -82,12 +82,16 @@ public:
struct Device {
/// Device ID
int device;
std::vector<int> devices;
/// Number of total devices
/// This is not set by the user; it is set automatically
int num_devices;
/// CUDA Device properties
cudaDeviceProp properties;
std::vector<cudaDeviceProp> properties;
/// Total memory allocation on device
/// Total memory allocation on each device
size_t maximum_capacity;
//
@ -100,8 +104,11 @@ public:
void print_options(std::ostream &out, int indent = 0) const;
void print_device_info(std::ostream &out) const;
/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
int compute_capability() const;
/// Returns the device ID from a device index
int device_id(size_t device_index) const;
/// Returns the compute capability of the listed devices (e.g. 61, 60, 70, 75)
int compute_capability(int device_index) const;
};
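
The new accessor's assumed semantics, as a sketch (the definition is not shown in this header diff):

int Options::Device::device_id(size_t device_index) const {
  return devices.at(device_index);  // map a profiler device index to a CUDA device ID
}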
/// Options related to initializing input tensors
@ -129,7 +136,7 @@ public:
//
explicit Initialization(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
@ -171,13 +178,13 @@ public:
//
explicit Verification(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
/// Returns true if a provider is enabled
bool provider_enabled(library::Provider provider) const;
/// Returns the index of a provider if it is enabled
size_t index(library::Provider provider) const;
};
@ -225,7 +232,7 @@ public:
/// Returns the index of a provider if it is enabled
size_t index(library::Provider provider) const;
};
/// Options related to reporting
struct Report {
@ -260,7 +267,7 @@ public:
//
explicit Report(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
};
@ -282,7 +289,7 @@ public:
//
explicit About(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
@ -303,7 +310,7 @@ public:
/// Vector of operation name substrings
std::vector<std::string> operation_names;
/// Vector of operation name substrings
std::vector<std::string> excluded_operation_names;

View File

@ -51,10 +51,10 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Conv2dOperationProfiler::Conv2dOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kConv2d,
{
{ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
{ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv2d problem space"},
@ -165,13 +165,13 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(
int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;
// Adjust mainloop flop for dgrad strided
if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
flops_mainloop_ = flops_mainloop_ / (stride_h * stride_w);
}
int64_t flops_total_ = flops_mainloop_ + flops_epilogue_;
//complex-valued support
switch (operation_desc.tile_description.math_instruction.math_operation) {
case library::MathOperationID::kMultiplyAddComplex:
@ -188,14 +188,14 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(
/// Extracts the problem dimensions
Status Conv2dOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(operation->description());
if (!arg_as_int(problem_.n, "n", problem_space, problem)) {
@ -207,7 +207,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.h = 16;
}
if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
// default value
problem_.w = 16;
@ -227,7 +227,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.r = 3;
}
if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
// default value
problem_.s = 3;
@ -280,14 +280,14 @@ Status Conv2dOperationProfiler::initialize_configuration(
// cutlass profiler sets p and q which are cuDNN compliant. //
// //
////////////////////////////////////////////////////////////////////////////////////////
// set convolution output p
if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.p = (
problem_.h +
2 * problem_.pad_h -
((problem_.r - 1) * problem_.dilation_h + 1)
) / (problem_.stride_h)
+ 1;
}
@ -295,10 +295,10 @@ Status Conv2dOperationProfiler::initialize_configuration(
if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
// default value (set using cudnn formula for output width, when q is not provided)
problem_.q = (
problem_.w +
2 * problem_.pad_w -
((problem_.s - 1) * problem_.dilation_w + 1)
) / (problem_.stride_w)
+ 1;
}
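
Worked example of the cuDNN output-size formula above, using this profiler's defaults (h = w = 16, r = s = 3) with illustrative pad = 1, stride = 1, dilation = 1:

//   p = (h + 2*pad_h - ((r - 1)*dilation_h + 1)) / stride_h + 1
//     = (16 + 2 - 3) / 1 + 1 = 16
//   q = (w + 2*pad_w - ((s - 1)*dilation_w + 1)) / stride_w + 1
//     = (16 + 2 - 3) / 1 + 1 = 16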
/////////////////////////////////////////////////////////////////////////////////////////
@ -313,7 +313,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.split_k_slices = 1;
}
if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
// default value
problem_.conv_mode = library::ConvModeID::kCrossCorrelation;
@ -345,24 +345,24 @@ Status Conv2dOperationProfiler::initialize_configuration(
}
if (!arg_as_scalar(
problem_.alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
problem_.beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -389,7 +389,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
int(problem_.split_k_slices),
int(problem_.groups)
);
conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));
conv_workspace_.set_stride_vector(
@ -420,7 +420,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
/// Initializes the performance result
void Conv2dOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::ConvDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -432,15 +432,15 @@ void Conv2dOperationProfiler::initialize_result_(
result.arguments.resize(problem_space.rank());
set_argument(result, "Activation", problem_space,
std::string(library::to_string(operation_desc.activation().element))
+ ":" + library::to_string(operation_desc.activation().layout));
set_argument(result, "Filter", problem_space,
std::string(library::to_string(operation_desc.filter().element))
+ ":" + library::to_string(operation_desc.filter().layout));
set_argument(result, "Output", problem_space,
std::string(library::to_string(operation_desc.output().element))
+ ":" + library::to_string(operation_desc.output().layout));
set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
@ -455,7 +455,7 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "k", problem_space, problem_.k);
set_argument(result, "r", problem_space, problem_.r);
set_argument(result, "s", problem_space, problem_.s);
set_argument(result, "p", problem_space, problem_.p);
set_argument(result, "q", problem_space, problem_.q);
@ -470,11 +470,11 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
set_argument(result, "split_k_mode", problem_space,
set_argument(result, "split_k_mode", problem_space,
std::string(library::to_string(problem_.split_k_mode)));
set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
set_argument(result, "conv_mode", problem_space,
set_argument(result, "conv_mode", problem_space,
std::string(library::to_string(problem_.conv_mode)));
set_argument(result, "alpha", problem_space,
@ -483,19 +483,19 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "beta", problem_space,
library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
set_argument(result, "eq_gemm_provider", problem_space,
set_argument(result, "eq_gemm_provider", problem_space,
std::string(library::to_string(problem_.eq_gemm_provider)));
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
// Bytes of activation, filter, and output tensors
int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
conv_workspace_.configuration.problem_size.activation_size();
int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
conv_workspace_.configuration.problem_size.filter_size();
int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
conv_workspace_.configuration.problem_size.output_size();
// Bytes of activation, filter, and output tensors
@ -511,14 +511,14 @@ void Conv2dOperationProfiler::initialize_result_(
/// Initialize reduction problem dimensions and library::Operation
bool Conv2dOperationProfiler::initialize_reduction_configuration_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &conv_desc =
static_cast<library::ConvDescription const &>(operation->description());
library::ConvKind const &conv_kind = conv_desc.conv_kind;
@ -545,14 +545,14 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
conv_workspace_.reduction_configuration.ldd =
conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
// find reduction operation
library::ReductionFunctionalKey reduction_key(
library::Provider::kCUTLASS,
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
conv_desc.C.element, // element output
conv_desc.element_epilogue // element compute
);
#if 0// debug print to check which reduction instance is selected
std::cout << reduction_key << "\n";
@ -562,7 +562,7 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
return false;
}
// initialize reduction operation required for parallel split-k conv2d operator
reduction_op_ = reduction_it->second;
@ -574,13 +574,24 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
/// Initializes workspace
Status Conv2dOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
// initialize conv2d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
@ -590,15 +601,15 @@ Status Conv2dOperationProfiler::initialize_workspace(
}
}
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(underlying_operation->description());
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
conv_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
conv_workspace_.problem_count = 1;
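
Worked example of the L2-camping heuristic above (illustrative numbers, not profiler code):

int64_t l2_bytes      = 40ll << 20;   // e.g., a 40 MiB L2
int64_t problem_bytes = 25ll << 20;   // total tensor footprint of one problem
int     problem_count = (problem_bytes < 3 * l2_bytes)
    ? 1 + int((3 * l2_bytes) / problem_bytes)   // = 1 + 120/25 = 5 rotating copies
    : 1;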
@ -611,7 +622,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
conv_workspace_.A = device_context.allocate_tensor(
conv_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -619,10 +630,11 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_a(operation_desc.conv_kind),
conv_workspace_.configuration.stride_a,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.B = device_context.allocate_tensor(
conv_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -630,12 +642,13 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.configuration.stride_b,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
if(problem_.groups == problem_.c && problem_.groups == problem_.k){
// Depthwise direct conv kernel needs to reorder the filter.
conv_workspace_.reordered_B = device_context.allocate_tensor(
conv_workspace_.reordered_B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -643,11 +656,12 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.configuration.stride_b,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
}
conv_workspace_.C = device_context.allocate_tensor(
conv_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -655,25 +669,30 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
conv_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
}
@ -706,10 +725,10 @@ Status Conv2dOperationProfiler::initialize_workspace(
conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);
status = reduction_op_->initialize(
&conv_workspace_.reduction_configuration,
conv_workspace_.reduction_host_workspace.data(),
nullptr);
if (status != Status::kSuccess) {
return status;
}
@ -736,7 +755,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Conv2dOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -769,7 +788,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
}
conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
// update library::ConvArguments for parallel split-k reduction
conv_workspace_.arguments.D = conv_workspace_.device_workspace.data();
@ -799,9 +818,9 @@ bool Conv2dOperationProfiler::verify_cutlass(
}
#if 0
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
<< "reduction : " << reduction_op_->description().name << std::endl;
#endif
@ -818,7 +837,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
results_.back().status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -840,7 +859,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// CUTLASS op ran but has not yet been verified against any verification provider
results_.back().disposition = Disposition::kNotVerified;
//
// Run verification providers
//
@ -856,7 +875,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);
// Initialize reference data to the source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
if (status == Status::kSuccess) {
@ -884,7 +903,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// Run verification device reference
if (options.verification.provider_enabled(library::Provider::kReferenceDevice)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_device_reference_(
@ -893,13 +912,13 @@ bool Conv2dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Run verification host reference
if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_host_reference_(
@ -908,10 +927,10 @@ bool Conv2dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -936,7 +955,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against host reference
bool Conv2dOperationProfiler::verify_with_host_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -954,14 +973,14 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
library::ConvFunctionalKey conv2d_key(
library::Provider::kReferenceHost,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
#if 0 // debug print to check which host reference instance is selected
@ -974,12 +993,12 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
}
// conv2d host reference minimum cc is 0 (CPU) and no iterator algorithm
library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
@ -1052,9 +1071,9 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1070,7 +1089,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
/// Verifies CUTLASS against host reference
bool Conv2dOperationProfiler::verify_with_device_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1088,14 +1107,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
library::ConvFunctionalKey conv2d_key(
library::Provider::kReferenceDevice,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key);
@ -1105,12 +1124,12 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;
return true;
}
// conv2d device reference minimum cc is 50 and no iterator algorithm
library::ConvPreferenceKey preference_key(50, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;
@ -1119,7 +1138,7 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
// device reference has only one instances in Conv2dOperationVectorMap
library::Operation const *reference_op = cc_it->second[0];
//
// Initialize device reference operation
//
@ -1166,9 +1185,9 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceDevice] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1183,14 +1202,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
/// Measures performance results
bool Conv2dOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
// Initialize structure containing Conv2d arguments
@ -1242,7 +1261,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
GpuTimer timer;
// initialize conv2d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
library::ConvArguments *conv_arguments = static_cast<library::ConvArguments *>(arguments);
@ -1274,7 +1293,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
conv_arguments->B = conv_workspace_.B->batch_data(problem_idx);
conv_arguments->C = conv_workspace_.C->batch_data(problem_idx);
conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx);
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
// update library::ConvArguments for parallel split-k reduction
conv_arguments->D = conv_workspace_.device_workspace.data();
@ -1304,7 +1323,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
return status;
}
}
//
// Initialize GPU timer
//
@ -1319,7 +1338,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
int iteration = 0;
for (; iteration < Iterations; ++iteration) {
// Setup rotating workspace
int problem_idx = (iteration % conv_workspace_.problem_count);
@ -1345,7 +1364,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
device_workspace);
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
@ -1367,7 +1386,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
//
// Update performance result
//
runtime = timer.duration(iteration);
return status;
@ -1378,13 +1397,13 @@ Status Conv2dOperationProfiler::profile_cutlass_(
/// Verifies CUTLASS against cudnn reference
bool Conv2dOperationProfiler::verify_with_cudnn_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
auto &conv_desc = static_cast<library::ConvDescription const &>(operation->description());
//
@ -1395,7 +1414,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
cudnnStatus_t status = handle.get_cudnn_create_status();
if (status != CUDNN_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
return true;
}
@ -1411,7 +1430,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
conv_workspace_.arguments.alpha = problem_.alpha.data();
conv_workspace_.arguments.beta = problem_.beta.data();
conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
// cuDNN does not support four tensor arguments, so we copy the tensor C data into
// tensor D.
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
@ -1423,8 +1442,8 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
// Construct dispatcher to cudnn operator
//
detail::cudnnConvDispatcher conv_op(
conv_desc,
conv_workspace_.configuration,
conv_workspace_.arguments,
handle
@ -1462,7 +1481,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {
save_workspace(

View File

@ -52,10 +52,10 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Conv3dOperationProfiler::Conv3dOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kConv3d,
{
{ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
{ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv3d problem space"},
@ -170,7 +170,7 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(
int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;
// Adjust mainloop flops for strided dgrad
if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
flops_mainloop_ = flops_mainloop_ / ( stride_d * stride_h * stride_w);
@ -183,14 +183,14 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(
/// Extracts the problem dimensions
Status Conv3dOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(operation->description());
if (!arg_as_int(problem_.n, "n", problem_space, problem)) {
@ -207,7 +207,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.h = 14;
}
if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
// default value
problem_.w = 14;
@ -232,7 +232,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.r = 3;
}
if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
// default value
problem_.s = 3;
@ -294,25 +294,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
// cutlass profiler sets p and q which are cuDNN compliant. //
// //
////////////////////////////////////////////////////////////////////////////////////////
// set convolution output z
if (!arg_as_int(problem_.z, "z", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.z = (
problem_.d +
2 * problem_.pad_d -
((problem_.t - 1) * problem_.dilation_d + 1)
) / (problem_.stride_d)
+ 1;
}
// set convolution output p
if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.p = (
problem_.h +
2 * problem_.pad_h -
((problem_.r - 1) * problem_.dilation_h + 1)
) / (problem_.stride_h)
+ 1;
}
@ -320,10 +320,10 @@ Status Conv3dOperationProfiler::initialize_configuration(
if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
// default value (set using cudnn formula for output width, when q is not provided)
problem_.q = (
problem_.w +
2 * problem_.pad_w -
((problem_.s - 1) * problem_.dilation_w + 1)
) / (problem_.stride_w)
+ 1;
}
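// Illustrative example: w = 14, pad_w = 1, s = 3, dilation_w = 1, stride_w = 1
// gives q = (14 + 2 - 3) / 1 + 1 = 14, matching cuDNN's output-extent formula
// used above for z, p, and q.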
/////////////////////////////////////////////////////////////////////////////////////////
@ -338,7 +338,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.split_k_slices = 1;
}
if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
// default value
problem_.conv_mode = library::ConvModeID::kCrossCorrelation;
@ -370,24 +370,24 @@ Status Conv3dOperationProfiler::initialize_configuration(
}
if (!arg_as_scalar(
problem_.alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
problem_.beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -420,25 +420,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
int(problem_.split_k_slices),
1 // groups
);
conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));
conv_workspace_.configuration.layout_activations.stride() = make_Coord(
int(problem_.c),
int(problem_.w) * int(problem_.c),
int(problem_.h) * int(problem_.w) * int(problem_.c),
int(problem_.d) * int(problem_.h) * int(problem_.w) * int(problem_.c)
);
conv_workspace_.configuration.layout_filters.stride() = make_Coord(
int(problem_.c),
int(problem_.s) * int(problem_.c),
int(problem_.r) * int(problem_.s) * int(problem_.c),
int(problem_.t) * int(problem_.r) * int(problem_.s) * int(problem_.c)
);
conv_workspace_.configuration.layout_output.stride() = make_Coord(
int(problem_.k),
int(problem_.q) * int(problem_.k),
int(problem_.q) * int(problem_.p) * int(problem_.k),
int(problem_.z) * int(problem_.q) * int(problem_.p) * int(problem_.k)
@ -469,7 +469,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
/// Initializes the performance result
void Conv3dOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::ConvDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -481,15 +481,15 @@ void Conv3dOperationProfiler::initialize_result_(
result.arguments.resize(problem_space.rank());
set_argument(result, "Activation", problem_space,
std::string(library::to_string(operation_desc.activation().element))
+ ":" + library::to_string(operation_desc.activation().layout));
set_argument(result, "Filter", problem_space,
std::string(library::to_string(operation_desc.filter().element))
+ ":" + library::to_string(operation_desc.filter().layout));
set_argument(result, "Output", problem_space,
std::string(library::to_string(operation_desc.output().element))
+ ":" + library::to_string(operation_desc.output().layout));
set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
@ -506,7 +506,7 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "t", problem_space, problem_.t);
set_argument(result, "r", problem_space, problem_.r);
set_argument(result, "s", problem_space, problem_.s);
set_argument(result, "z", problem_space, problem_.z);
set_argument(result, "p", problem_space, problem_.p);
set_argument(result, "q", problem_space, problem_.q);
@ -523,11 +523,11 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
set_argument(result, "split_k_mode", problem_space,
set_argument(result, "split_k_mode", problem_space,
std::string(library::to_string(problem_.split_k_mode)));
set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
set_argument(result, "conv_mode", problem_space,
set_argument(result, "conv_mode", problem_space,
std::string(library::to_string(problem_.conv_mode)));
set_argument(result, "alpha", problem_space,
@ -536,7 +536,7 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "beta", problem_space,
library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
set_argument(result, "eq_gemm_provider", problem_space,
set_argument(result, "eq_gemm_provider", problem_space,
std::string(library::to_string(problem_.eq_gemm_provider)));
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -554,14 +554,14 @@ void Conv3dOperationProfiler::initialize_result_(
/// Initialize reduction problem dimensions and library::Operation
bool Conv3dOperationProfiler::initialize_reduction_configuration_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &conv_desc =
static_cast<library::ConvDescription const &>(operation->description());
library::ConvKind const &conv_kind = conv_desc.conv_kind;
@ -585,14 +585,14 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
// find reduction operation
library::ReductionFunctionalKey reduction_key(
library::Provider::kCUTLASS,
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
conv_desc.C.element, // element output
conv_desc.element_epilogue // element compute
);
#if 0// debug print to check which reduction instance is selected
std::cout << reduction_key << "\n";
@ -602,7 +602,7 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
return false;
}
// initialize reduction operation required for parallel split-k conv3d operator
reduction_op_ = reduction_it->second;
@ -614,13 +614,24 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
/// Initializes workspace
Status Conv3dOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
// initialize conv3d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
@ -630,15 +641,15 @@ Status Conv3dOperationProfiler::initialize_workspace(
}
}
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(underlying_operation->description());
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
conv_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
conv_workspace_.problem_count = 1;
@ -651,7 +662,7 @@ Status Conv3dOperationProfiler::initialize_workspace(
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
conv_workspace_.A = device_context.allocate_tensor(
conv_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -659,10 +670,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_a(operation_desc.conv_kind),
conv_workspace_.stride_a(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.B = device_context.allocate_tensor(
conv_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -670,10 +682,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.stride_b(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.C = device_context.allocate_tensor(
conv_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -681,27 +694,32 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
conv_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
}
//
@ -733,10 +751,10 @@ Status Conv3dOperationProfiler::initialize_workspace(
conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);
status = reduction_op_->initialize(
&conv_workspace_.reduction_configuration,
conv_workspace_.reduction_host_workspace.data(),
nullptr);
if (status != Status::kSuccess) {
return status;
}
@ -763,7 +781,7 @@ Status Conv3dOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Conv3dOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -784,7 +802,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
set_cutlass_operator_arguments_();
conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());
//
// Run the CUTLASS operation
//
@ -799,9 +817,9 @@ bool Conv3dOperationProfiler::verify_cutlass(
}
#if 0
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
<< "reduction : " << reduction_op_->description().name << std::endl;
#endif
@ -818,7 +836,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
results_.back().status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -840,7 +858,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
// CUTLASS op ran but has not yet been verified against any verification provider
results_.back().disposition = Disposition::kNotVerified;
//
// Run verification providers
//
@ -856,7 +874,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);
// Initialize reference data to the source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
if (status == Status::kSuccess) {
@ -883,8 +901,8 @@ bool Conv3dOperationProfiler::verify_cutlass(
// Run verification host reference
if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_host_reference_(
@ -893,10 +911,10 @@ bool Conv3dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -921,7 +939,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against host reference
bool Conv3dOperationProfiler::verify_with_host_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -939,14 +957,14 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
library::ConvFunctionalKey conv_key(
library::Provider::kReferenceHost,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
#if 0 // debug print to check which host reference instance is selected
@ -959,12 +977,12 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
}
// conv3d host reference minimum cc is 0 (CPU) and no iterator algorithm
library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
@ -1035,9 +1053,9 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1053,7 +1071,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
/// Verifies CUTLASS against host reference
bool Conv3dOperationProfiler::verify_with_device_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1068,14 +1086,14 @@ bool Conv3dOperationProfiler::verify_with_device_reference_(
/// Measures performance results
bool Conv3dOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
set_cutlass_operator_arguments_();
@ -1180,7 +1198,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
return status;
}
}
//
// Initialize GPU timer
//
@ -1198,9 +1216,9 @@ Status Conv3dOperationProfiler::profile_cutlass_(
// Setup rotating workspace
int problem_idx = (iteration % conv_workspace_.problem_count);
set_cutlass_operator_arguments_(problem_idx);
// Run underlying conv3d operation
status = underlying_operation->run(
arguments,
@ -1208,7 +1226,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
device_workspace);
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -1229,7 +1247,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
//
// Update performance result
//
runtime = timer.duration(iteration);
return status;
@ -1240,7 +1258,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
/// Verifies CUTLASS against cudnn reference
bool Conv3dOperationProfiler::verify_with_cudnn_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1257,7 +1275,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
cudnnStatus_t status = handle.get_cudnn_create_status();
if (status != CUDNN_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
return true;
}
@ -1285,8 +1303,8 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
// Construct dispatcher to cudnn operator
//
detail::cudnnConvDispatcher conv_op(
conv_desc,
conv_workspace_.configuration,
conv_workspace_.arguments,
handle
@ -1323,7 +1341,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {
save_workspace(

View File

@ -259,6 +259,25 @@ Status cublas_satisfies(library::GemmDescription const &desc) {
return Status::kErrorNotSupported;
}
// Refer to https://docs.nvidia.com/cuda/cublas/#id105
// input types A and B both being FE5M2 is not supported in cuBLASLt
if(desc.A.element == library::NumericTypeID::kFE5M2 &&
desc.B.element == library::NumericTypeID::kFE5M2){
return Status::kErrorNotSupported;
}
// Refer to https://docs.nvidia.com/cuda/cublas/#id105
// if input types A and B are FE5M2 and FE4M3 and C is F32, then D must also be F32
if (desc.A.element == library::NumericTypeID::kFE5M2 &&
desc.B.element == library::NumericTypeID::kFE4M3 &&
desc.C.element == library::NumericTypeID::kF32 &&
desc.D.element != library::NumericTypeID::kF32 ){
return Status::kErrorNotSupported;
}
// output type S4 and S8 not supported in cuBLAS
if (desc.C.element == library::NumericTypeID::kS4 ||
desc.C.element == library::NumericTypeID::kS8) {
@ -405,7 +424,261 @@ cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) {
}
}
} // namespace detail
cublasLtGemmExDispatcher::cublasLtGemmExDispatcher(
library::GemmDescription const &op_desc,
library::GemmUniversalConfiguration configuration_,
library::GemmUniversalArguments arguments_
):
op_desc(op_desc), configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {
bool good = true;
good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
good = (good && get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B));
good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
good = (good && get_cublas_datatype(data_type_B, op_desc.B.element));
good = (good && get_cublas_datatype(data_type_C, op_desc.C.element));
good = (good && get_cublas_datatype(
compute_data_type,
op_desc.tile_description.math_instruction.element_accumulator));
// cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
// internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
library::OpcodeClassID const & opcode_class =
op_desc.tile_description.math_instruction.opcode_class;
if (good &&
op_desc.A.element == library::NumericTypeID::kF32 &&
op_desc.B.element == library::NumericTypeID::kF32 &&
opcode_class == library::OpcodeClassID::kTensorOp) {
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
}
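// F32 tensor-op kernels map to CUBLAS_COMPUTE_32F_FAST_TF32, presumably so the
// cuBLAS baseline exercises the same TF32 tensor-core path as CUTLASS's F32
// tensor-op kernels it is compared against.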
else if (good) {
bool const isPedantic = false;
switch (compute_data_type) {
case CUDA_R_32F:
case CUDA_C_32F:
compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
break;
case CUDA_R_64F:
case CUDA_C_64F:
compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
break;
case CUDA_R_16F:
compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
break;
case CUDA_R_32I:
compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
break;
default:
good = false;
break;
}
}
#endif // __CUDACC_VER_MAJOR__ >= 11
if (!good) {
status = Status::kErrorNotSupported;
}
}
void cublasLtGemmExDispatcher::initialize_cublaslt(){
// create operation descriptor; see cublasLtMatmulDescAttributes_t for details about defaults; here we just need to
// set the transforms for A and B
cublasLtMatmulDescCreate(&operationDesc, compute_type, compute_data_type);
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_A, sizeof(trans_A));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_B, sizeof(trans_B));
uint64_t contiguous_A = (trans_A == CUBLAS_OP_N ? configuration.problem_size.m() : configuration.problem_size.k());
uint64_t strided_A = (trans_A == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.m());
uint64_t contiguous_B = (trans_B == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.n());
uint64_t strided_B = (trans_B == CUBLAS_OP_N ? configuration.problem_size.n() : configuration.problem_size.k());
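// cuBLASLt matrix layouts default to column-major order, so the "contiguous"
// extent above is the row count and the "strided" extent the column count of
// each (possibly transposed) operand.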
// create matrix descriptors, we are good with the details here so no need to set any extra attributes
// table of supported type combinations can be found in the documentation: https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmul
cublasLtMatrixLayoutCreate(&Adesc, data_type_A, contiguous_A, strided_A, configuration.lda);
cublasLtMatrixLayoutCreate(&Bdesc, data_type_B, contiguous_B, strided_B, configuration.ldb);
cublasLtMatrixLayoutCreate(&Cdesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldc);
cublasLtMatrixLayoutCreate(&Ddesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldd);
}
bool cublasLtGemmExDispatcher::get_cublaslt_algo(cublasLtHandle_t handle,
AlgorithmMode algorithm_mode
){
const int requestedAlgoCount = 8; // Request 8 algorithms from the heuristic call; cuBLASLt heuristics return at most 8.
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {};
#if (__CUDACC_VER_MAJOR__ >= 12)
// Decide, based on the unique operation name, whether to enable fast accumulation for the cuBLAS kernel.
std::string operation_name(op_desc.name);
if(operation_name.find("fastaccum") != std::string::npos){
const int8_t fastAccuMode = 1;
cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_FAST_ACCUM,
&fastAccuMode,
sizeof(fastAccuMode));
}
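// CUBLASLT_MATMUL_DESC_FAST_ACCUM enables FP8 fast accumulation, letting the
// kernel accumulate partial sums at reduced precision for higher throughput;
// this mirrors CUTLASS's *fastaccum kernel variants selected by name above.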
#endif // __CUDACC_VER_MAJOR__ >= 12
// Use 32 MB for the Hopper kernels; this is the max workspace size passed to cublasLtMatmulAlgoGetHeuristic().
size_t workspaceSizeForHeuristics = 32ULL * 1024 * 1024;
void* workspaceHeuristic = nullptr;
cudaError_t result = cudaMalloc((void **)&workspaceHeuristic, workspaceSizeForHeuristics);
if (result != cudaSuccess) {
throw std::bad_alloc();
}
// create preference handle; here we could use extra attributes to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C; here for simplicity we just assume A,B,C are always well aligned (e.g.
// directly come from cudaMalloc)
cublasLtMatmulPreferenceCreate(&preference);
cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSizeForHeuristics, sizeof(workspaceSizeForHeuristics));
cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, requestedAlgoCount, heuristicResult, &returnedResults);
if (returnedResults == 0) {
return false;
}
int bestAlgoIdx = 0;
//
// Auto-tuning to find the best kernel for the given problem
//
if (algorithm_mode == AlgorithmMode::kBest) {
float time = 0;
float bestAlgoTime = 0;
cudaStream_t stream;
cudaEvent_t startEvent, stopEvent;
cudaStreamCreate(&stream);
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
constexpr int repeatAlgoCheck = 5;
std::vector<float> algoTimes(repeatAlgoCheck);
for (int algoIdx = 0; algoIdx < returnedResults; algoIdx++) {
for (int checkIdx = 0; checkIdx < repeatAlgoCheck; checkIdx++) {
cudaEventRecord(startEvent, stream);
cublasStatus_t status = cublasLtMatmul(handle,
operationDesc,
arguments.alpha,
arguments.A,
Adesc,
arguments.B,
Bdesc,
arguments.beta,
arguments.C,
Cdesc,
arguments.D,
Ddesc,
&heuristicResult[algoIdx].algo,
workspaceHeuristic,
heuristicResult[algoIdx].workspaceSize,
stream);
// Handle errors
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasLtMatmul AutoTuning failed with status: " << cublasLtGetStatusName(status) << std::endl;
return false;
}
cudaEventRecord(stopEvent, stream);
cudaEventSynchronize(stopEvent);
cudaEventElapsedTime(&time, startEvent, stopEvent);
algoTimes[checkIdx] = time;
}
const size_t size = algoTimes.size();
if (size == 0) {
time = 0;
}
std::sort(algoTimes.begin(), algoTimes.end());
const size_t mid = size / 2;
if (size % 2 == 0) {
time = (algoTimes[mid] + algoTimes[mid - 1]) / 2;
}
else {
time = algoTimes[mid];
}
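// time now holds the median of the repeated timings, which is more robust to
// a single noisy measurement than the mean or the minimum.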
if (algoIdx == 0 || time < bestAlgoTime) {
bestAlgoTime = time;
bestAlgoIdx = algoIdx;
}
}
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
std::cout << "\n";
std::cout << "# Algorithms checked: " << returnedResults << "\n";
std::cout << "WorkspaceSize Allocated: " << heuristicResult[bestAlgoIdx].workspaceSize << "\n";
std::cout << "Algorithm selected after auto-tuning is:" << "\n";
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme;
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
printf("algo={ Id=%d, tileIdx=%d splitK=%d reduc=%d swizzle=%d custom=%d }\n",
algoId, tile, numSplitsK, reductionScheme, swizzle, customOption);
#endif
if (stream) cudaStreamDestroy(stream);
if (startEvent) cudaEventDestroy(startEvent);
if (stopEvent) cudaEventDestroy(stopEvent);
}
// Set the selected algorithm on the dispatcher
heuristicResult_ = heuristicResult[bestAlgoIdx];
result = cudaMalloc((void **)&workspace, heuristicResult_.workspaceSize);
if (result != cudaSuccess) {
throw std::bad_alloc();
}
return true;
}
cublasStatus_t cublasLtGemmExDispatcher::operator()(cublasLtHandle_t handle)
{
return cublasLtMatmul(handle,
operationDesc,
arguments.alpha,
arguments.A,
Adesc,
arguments.B,
Bdesc,
arguments.beta,
arguments.C,
Cdesc,
arguments.D,
Ddesc,
&heuristicResult_.algo,
workspace,
heuristicResult_.workspaceSize,
0); // launch on the default CUDA stream (stream 0)
}
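// Typical call sequence for this dispatcher (illustrative sketch; assumes a
// valid cublasLtHandle_t `handle` and populated desc/config/args objects):
//
//   detail::cublasLtGemmExDispatcher gemm(desc, config, args);
//   gemm.initialize_cublaslt();
//   if (gemm.get_cublaslt_algo(handle, AlgorithmMode::kBest)) {
//     cublasStatus_t st = gemm(handle);  // runs cublasLtMatmul
//   }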
} // namespace detail
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -208,19 +208,6 @@ void CutlassProfiler::print_options_(std::ostream &out) {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initializes the CUDA device
void CutlassProfiler::initialize_device_() {
cudaError_t result = cudaSetDevice(options_.device.device);
if (result != cudaSuccess) {
std::cerr << "Failed to set device.";
throw std::runtime_error("Failed to set device");
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace profiler
} // namespace cutlass

View File

@ -88,16 +88,16 @@ static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &ext
/// Returns the stride of a packed layout
std::vector<int64_t> DeviceAllocation::get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent) {
std::vector<int64_t> stride;
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
stride = get_packed_layout_stride<cutlass::layout::ColumnMajor>(extent);
break;
case library::LayoutTypeID::kRowMajor:
stride = get_packed_layout_stride<cutlass::layout::RowMajor>(extent);
break;
case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -159,7 +159,7 @@ std::vector<int64_t> DeviceAllocation::get_packed_layout(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Template to use CUTLASS Layout functions to
template <typename Layout>
static size_t construct_layout_(
void *bytes,
@ -177,8 +177,8 @@ static size_t construct_layout_(
stride = get_packed_layout_stride<Layout>(extent);
return construct_layout_<Layout>(
bytes,
layout_id,
extent,
stride);
}
@ -202,7 +202,7 @@ static size_t construct_layout_(
// Pack it into bytes
if (bytes) {
*reinterpret_cast<Layout *>(bytes) = layout;
}
// Return capacity
@ -219,10 +219,10 @@ size_t DeviceAllocation::construct_layout(
std::vector<int64_t> &stride) {
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
return construct_layout_<cutlass::layout::ColumnMajor>(bytes, layout_id, extent, stride);
case library::LayoutTypeID::kRowMajor:
return construct_layout_<cutlass::layout::RowMajor>(bytes, layout_id, extent, stride);
case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -284,24 +284,26 @@ size_t DeviceAllocation::construct_layout(
/////////////////////////////////////////////////////////////////////////////////////////////////
DeviceAllocation::DeviceAllocation():
type_(library::NumericTypeID::kInvalid),
batch_stride_(0),
capacity_(0),
pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown),
batch_count_(1) {
batch_count_(1),
device_(-1) {
}
DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
size_t capacity
library::NumericTypeID type,
size_t capacity,
int device
):
type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown), batch_count_(1) {
type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown), batch_count_(1), device_(device) {
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity));
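// Note: this->malloc is assumed here to be DeviceAllocation's device-aware
// allocator (switching to device_ before allocating); its definition is not
// part of this hunk.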
if (result != cudaSuccess) {
type_ = library::NumericTypeID::kInvalid;
@ -312,13 +314,15 @@ DeviceAllocation::DeviceAllocation(
}
DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count
int batch_count,
int device
):
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)),
pointer_(nullptr), batch_count_(1), device_(device) {
reset(type, layout_id, extent, stride, batch_count);
}
@ -355,7 +359,7 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca
batch_stride_ = capacity;
capacity_ = capacity;
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type_, capacity_));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type_, capacity_));
if (result != cudaSuccess) {
throw std::bad_alloc();
}
@ -373,9 +377,9 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca
/// Allocates memory for a given layout and tensor
DeviceAllocation &DeviceAllocation::reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count) {
@ -391,14 +395,14 @@ DeviceAllocation &DeviceAllocation::reset(
batch_count_ = batch_count;
batch_stride_ = construct_layout(
tensor_ref_buffer_.data() + sizeof(pointer_),
layout_id,
extent,
stride_);
capacity_ = batch_stride_ * batch_count_;
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity_));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity_));
if (result != cudaSuccess) {
throw std::bad_alloc();
}
@ -421,7 +425,7 @@ void *DeviceAllocation::data() const {
}
void *DeviceAllocation::batch_data(int batch_idx) const {
return static_cast<char *>(data()) + batch_stride_bytes() * batch_idx;
}
library::LayoutTypeID DeviceAllocation::layout() const {
@ -1476,159 +1480,159 @@ void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeI
/// Returns true if two blocks have exactly the same value
bool DeviceAllocation::block_compare_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity) {
switch (numeric_type) {
case library::NumericTypeID::kFE4M3:
return reference::device::BlockCompareEqual<float_e4m3_t>(
reinterpret_cast<float_e4m3_t const *>(ptr_A),
reinterpret_cast<float_e4m3_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kFE5M2:
return reference::device::BlockCompareEqual<float_e5m2_t>(
reinterpret_cast<float_e5m2_t const *>(ptr_A),
reinterpret_cast<float_e5m2_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kF16:
return reference::device::BlockCompareEqual<half_t>(
reinterpret_cast<half_t const *>(ptr_A),
reinterpret_cast<half_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kBF16:
return reference::device::BlockCompareEqual<bfloat16_t>(
reinterpret_cast<bfloat16_t const *>(ptr_A),
reinterpret_cast<bfloat16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kTF32:
return reference::device::BlockCompareEqual<tfloat32_t>(
reinterpret_cast<tfloat32_t const *>(ptr_A),
reinterpret_cast<tfloat32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kF32:
return reference::device::BlockCompareEqual<float>(
reinterpret_cast<float const *>(ptr_A),
reinterpret_cast<float const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF32:
return reference::device::BlockCompareEqual<cutlass::complex<float> >(
reinterpret_cast<complex<float> const *>(ptr_A),
reinterpret_cast<complex<float> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF16:
return reference::device::BlockCompareEqual<complex<half_t>>(
reinterpret_cast<complex<half_t> const *>(ptr_A),
reinterpret_cast<complex<half_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCBF16:
return reference::device::BlockCompareEqual<complex<bfloat16_t>>(
reinterpret_cast<complex<bfloat16_t> const *>(ptr_A),
reinterpret_cast<complex<bfloat16_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCTF32:
return reference::device::BlockCompareEqual<complex<tfloat32_t>>(
reinterpret_cast<complex<tfloat32_t> const *>(ptr_A),
reinterpret_cast<complex<tfloat32_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kF64:
return reference::device::BlockCompareEqual<double>(
reinterpret_cast<double const *>(ptr_A),
reinterpret_cast<double const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF64:
return reference::device::BlockCompareEqual<complex<double>>(
reinterpret_cast<complex<double> const *>(ptr_A),
reinterpret_cast<complex<double> const *>(ptr_B),
capacity);
case library::NumericTypeID::kS2:
return reference::device::BlockCompareEqual<int2b_t>(
reinterpret_cast<int2b_t const *>(ptr_A),
reinterpret_cast<int2b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS4:
return reference::device::BlockCompareEqual<int4b_t>(
reinterpret_cast<int4b_t const *>(ptr_A),
reinterpret_cast<int4b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS8:
return reference::device::BlockCompareEqual<int8_t>(
reinterpret_cast<int8_t const *>(ptr_A),
reinterpret_cast<int8_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS16:
return reference::device::BlockCompareEqual<int16_t>(
reinterpret_cast<int16_t const *>(ptr_A),
reinterpret_cast<int16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS32:
return reference::device::BlockCompareEqual<int32_t>(
reinterpret_cast<int32_t const *>(ptr_A),
reinterpret_cast<int32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS64:
return reference::device::BlockCompareEqual<int64_t>(
reinterpret_cast<int64_t const *>(ptr_A),
reinterpret_cast<int64_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kB1:
return reference::device::BlockCompareEqual<uint1b_t>(
reinterpret_cast<uint1b_t const *>(ptr_A),
reinterpret_cast<uint1b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU2:
return reference::device::BlockCompareEqual<uint2b_t>(
reinterpret_cast<uint2b_t const *>(ptr_A),
reinterpret_cast<uint2b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU4:
return reference::device::BlockCompareEqual<uint4b_t>(
reinterpret_cast<uint4b_t const *>(ptr_A),
reinterpret_cast<uint4b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU8:
return reference::device::BlockCompareEqual<uint8_t>(
reinterpret_cast<uint8_t const *>(ptr_A),
reinterpret_cast<uint8_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU16:
return reference::device::BlockCompareEqual<uint16_t>(
reinterpret_cast<uint16_t const *>(ptr_A),
reinterpret_cast<uint16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU32:
return reference::device::BlockCompareEqual<uint32_t>(
reinterpret_cast<uint32_t const *>(ptr_A),
reinterpret_cast<uint32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU64:
return reference::device::BlockCompareEqual<uint64_t>(
reinterpret_cast<uint64_t const *>(ptr_A),
reinterpret_cast<uint64_t const *>(ptr_B),
capacity);
default:
@ -1638,9 +1642,9 @@ bool DeviceAllocation::block_compare_equal(
/// Returns true if two blocks have approximately the same value
bool DeviceAllocation::block_compare_relatively_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity,
double epsilon,
double nonzero_floor) {
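The per-type dispatch below forwards to device-side reference kernels. As a sketch of the intended comparison semantics (an assumption for illustration; the authoritative element-wise definition is cutlass::relatively_equal in the CUTLASS utilities):

#include <algorithm>
#include <cmath>

// Sketch only: a relative comparison with a floor, so values near zero are
// not held to an impossibly tight tolerance.
bool relatively_equal_sketch(double a, double b, double epsilon, double nonzero_floor) {
  double magnitude = std::max(std::fabs(a), std::fabs(b));
  magnitude = std::max(magnitude, nonzero_floor);  // clamp near-zero magnitudes
  return std::fabs(a - b) <= epsilon * magnitude;
}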
@ -1648,161 +1652,161 @@ bool DeviceAllocation::block_compare_relatively_equal(
switch (numeric_type) {
case library::NumericTypeID::kFE4M3:
return reference::device::BlockCompareRelativelyEqual<float_e4m3_t>(
reinterpret_cast<float_e4m3_t const *>(ptr_A),
reinterpret_cast<float_e4m3_t const *>(ptr_B),
capacity,
static_cast<float_e4m3_t>(epsilon),
static_cast<float_e4m3_t>(nonzero_floor));
case library::NumericTypeID::kFE5M2:
return reference::device::BlockCompareRelativelyEqual<float_e5m2_t>(
reinterpret_cast<float_e5m2_t const *>(ptr_A),
reinterpret_cast<float_e5m2_t const *>(ptr_B),
capacity,
static_cast<float_e5m2_t>(epsilon),
static_cast<float_e5m2_t>(nonzero_floor));
case library::NumericTypeID::kF16:
return reference::device::BlockCompareRelativelyEqual<half_t>(
reinterpret_cast<half_t const *>(ptr_A),
reinterpret_cast<half_t const *>(ptr_B),
capacity,
static_cast<half_t>(epsilon),
static_cast<half_t>(nonzero_floor));
case library::NumericTypeID::kBF16:
return reference::device::BlockCompareRelativelyEqual<bfloat16_t>(
reinterpret_cast<bfloat16_t const *>(ptr_A),
reinterpret_cast<bfloat16_t const *>(ptr_B),
capacity,
static_cast<bfloat16_t>(epsilon),
static_cast<bfloat16_t>(nonzero_floor));
case library::NumericTypeID::kTF32:
return reference::device::BlockCompareRelativelyEqual<tfloat32_t>(
reinterpret_cast<tfloat32_t const *>(ptr_A),
reinterpret_cast<tfloat32_t const *>(ptr_B),
capacity,
static_cast<tfloat32_t>(epsilon),
static_cast<tfloat32_t>(nonzero_floor));
case library::NumericTypeID::kF32:
return reference::device::BlockCompareRelativelyEqual<float>(
reinterpret_cast<float const *>(ptr_A),
reinterpret_cast<float const *>(ptr_B),
capacity,
static_cast<float>(epsilon),
static_cast<float>(nonzero_floor));
case library::NumericTypeID::kF64:
return reference::device::BlockCompareRelativelyEqual<double>(
reinterpret_cast<double const *>(ptr_A),
reinterpret_cast<double const *>(ptr_B),
capacity,
static_cast<double>(epsilon),
static_cast<double>(nonzero_floor));
case library::NumericTypeID::kS2:
return reference::device::BlockCompareRelativelyEqual<int2b_t>(
reinterpret_cast<int2b_t const *>(ptr_A),
reinterpret_cast<int2b_t const *>(ptr_B),
capacity,
static_cast<int2b_t>(epsilon),
static_cast<int2b_t>(nonzero_floor));
case library::NumericTypeID::kS4:
return reference::device::BlockCompareRelativelyEqual<int4b_t>(
reinterpret_cast<int4b_t const *>(ptr_A),
reinterpret_cast<int4b_t const *>(ptr_B),
capacity,
static_cast<int4b_t>(epsilon),
static_cast<int4b_t>(nonzero_floor));
case library::NumericTypeID::kS8:
return reference::device::BlockCompareRelativelyEqual<int8_t>(
reinterpret_cast<int8_t const *>(ptr_A),
reinterpret_cast<int8_t const *>(ptr_B),
capacity,
static_cast<int8_t>(epsilon),
static_cast<int8_t>(nonzero_floor));
case library::NumericTypeID::kS16:
return reference::device::BlockCompareRelativelyEqual<int16_t>(
reinterpret_cast<int16_t const *>(ptr_A),
reinterpret_cast<int16_t const *>(ptr_B),
capacity,
static_cast<int16_t>(epsilon),
static_cast<int16_t>(nonzero_floor));
case library::NumericTypeID::kS32:
return reference::device::BlockCompareRelativelyEqual<int32_t>(
reinterpret_cast<int32_t const *>(ptr_A),
reinterpret_cast<int32_t const *>(ptr_B),
capacity,
static_cast<int32_t>(epsilon),
static_cast<int32_t>(nonzero_floor));
case library::NumericTypeID::kS64:
return reference::device::BlockCompareRelativelyEqual<int64_t>(
reinterpret_cast<int64_t const *>(ptr_A),
reinterpret_cast<int64_t const *>(ptr_B),
capacity,
static_cast<int64_t>(epsilon),
static_cast<int64_t>(nonzero_floor));
case library::NumericTypeID::kB1:
return reference::device::BlockCompareRelativelyEqual<uint1b_t>(
reinterpret_cast<uint1b_t const *>(ptr_A),
reinterpret_cast<uint1b_t const *>(ptr_B),
capacity,
static_cast<uint1b_t>(epsilon),
static_cast<uint1b_t>(nonzero_floor));
case library::NumericTypeID::kU2:
return reference::device::BlockCompareRelativelyEqual<uint2b_t>(
reinterpret_cast<uint2b_t const *>(ptr_A),
reinterpret_cast<uint2b_t const *>(ptr_B),
capacity,
static_cast<uint2b_t>(epsilon),
static_cast<uint2b_t>(nonzero_floor));
case library::NumericTypeID::kU4:
return reference::device::BlockCompareRelativelyEqual<uint4b_t>(
reinterpret_cast<uint4b_t const *>(ptr_A),
reinterpret_cast<uint4b_t const *>(ptr_B),
capacity,
static_cast<uint4b_t>(epsilon),
static_cast<uint4b_t>(nonzero_floor));
case library::NumericTypeID::kU8:
return reference::device::BlockCompareRelativelyEqual<uint8_t>(
reinterpret_cast<uint8_t const *>(ptr_A),
reinterpret_cast<uint8_t const *>(ptr_B),
capacity,
static_cast<uint8_t>(epsilon),
static_cast<uint8_t>(nonzero_floor));
case library::NumericTypeID::kU16:
return reference::device::BlockCompareRelativelyEqual<uint16_t>(
reinterpret_cast<uint16_t const *>(ptr_A),
reinterpret_cast<uint16_t const *>(ptr_B),
capacity,
static_cast<uint16_t>(epsilon),
static_cast<uint16_t>(nonzero_floor));
case library::NumericTypeID::kU32:
return reference::device::BlockCompareRelativelyEqual<uint32_t>(
reinterpret_cast<uint32_t const *>(ptr_A),
reinterpret_cast<uint32_t const *>(ptr_B),
capacity,
static_cast<uint32_t>(epsilon),
static_cast<uint32_t>(nonzero_floor));
case library::NumericTypeID::kU64:
return reference::device::BlockCompareRelativelyEqual<uint64_t>(
reinterpret_cast<uint64_t const *>(ptr_A),
reinterpret_cast<uint64_t const *>(ptr_B),
capacity,
static_cast<uint64_t>(epsilon),
static_cast<uint64_t>(nonzero_floor));
// No relatively equal comparison for complex numbers.
@ -1821,7 +1825,7 @@ bool DeviceAllocation::block_compare_relatively_equal(
reinterpret_cast<complex<float> const *>(ptr_A),
reinterpret_cast<complex<float> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF64:
return reference::device::BlockCompareEqual<cutlass::complex<double> >(
reinterpret_cast<complex<double> const *>(ptr_A),
@ -1837,14 +1841,14 @@ bool DeviceAllocation::block_compare_relatively_equal(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord, int Rank>
struct vector_to_coord {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
coord[Rank - 1] = vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
@ -1853,17 +1857,17 @@ struct vector_to_coord {
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[Rank - 1] = (int)vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
};
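A usage sketch of this recursive copy (the specializations below terminate the recursion; sizes are illustrative):

// Copies a dynamic std::vector into a static-rank coordinate by
// compile-time recursion from the highest index down.
cutlass::Coord<3> coord;
std::vector<int> vec = {4, 8, 16};
vector_to_coord<cutlass::Coord<3>, 3>(coord, vec);
// coord now holds {4, 8, 16}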
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 1> {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
coord[0] = vec.at(0);
@ -1875,10 +1879,10 @@ struct vector_to_coord<TensorCoord, 1> {
}
};
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 0> {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
}
@ -1888,7 +1892,7 @@ struct vector_to_coord<TensorCoord, 0> {
template <typename Element, typename Layout>
static void write_tensor_csv_static_tensor_view(
std::ostream &out,
DeviceAllocation &allocation) {
Coord<Layout::kRank> extent;
@ -1903,7 +1907,7 @@ static void write_tensor_csv_static_tensor_view(
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
Layout::kStrideRank>(stride, allocation.stride());
Layout layout(stride);
@ -1914,7 +1918,7 @@ static void write_tensor_csv_static_tensor_view(
}
host_tensor.copy_in_device_to_host(
static_cast<Element const *>(allocation.data()),
allocation.batch_stride());
TensorViewWrite(out, host_tensor.host_view());
@ -1926,7 +1930,7 @@ static void write_tensor_csv_static_tensor_view(
template <typename T>
static void write_tensor_csv_static_type(
std::ostream &out,
DeviceAllocation &allocation) {
switch (allocation.layout()) {
@ -1991,7 +1995,7 @@ static void write_tensor_csv_static_type(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Writes a tensor to csv
void DeviceAllocation::write_tensor_csv(
std::ostream &out) {
@ -1999,14 +2003,14 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kFE4M3:
write_tensor_csv_static_type<float_e4m3_t>(out, *this);
break;
case library::NumericTypeID::kFE5M2:
write_tensor_csv_static_type<float_e5m2_t>(out, *this);
break;
case library::NumericTypeID::kF16:
write_tensor_csv_static_type<half_t>(out, *this);
break;
case library::NumericTypeID::kBF16:
write_tensor_csv_static_type<bfloat16_t>(out, *this);
break;
@ -2022,7 +2026,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kF64:
write_tensor_csv_static_type<double>(out, *this);
break;
case library::NumericTypeID::kS2:
write_tensor_csv_static_type<int2b_t>(out, *this);
break;
@ -2046,7 +2050,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kS64:
write_tensor_csv_static_type<int64_t>(out, *this);
break;
case library::NumericTypeID::kB1:
write_tensor_csv_static_type<uint1b_t>(out, *this);
break;
@ -2074,7 +2078,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kU64:
write_tensor_csv_static_type<uint64_t>(out, *this);
break;
case library::NumericTypeID::kCF16:
write_tensor_csv_static_type<cutlass::complex<half_t> >(out, *this);
break;
@ -2110,7 +2114,7 @@ static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val =
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
Layout::kStrideRank>(stride, allocation.stride());
TensorView<Element, Layout> view(
@ -2432,6 +2436,46 @@ void DeviceAllocation::fill_host(double val = 0.0) {
copy_from_host(host_data.data());
}
cudaError_t DeviceAllocation::malloc(void** ptr, size_t size) {
cudaError_t result;
int set_device_back_to = -1;
/// If the allocation is bound to a specific device, switch to it, remembering
/// the caller's current device so it can be restored after the cudaMalloc.
if (device_ >= 0) {
int current_device;
result = cudaGetDevice(&current_device);
if (result != cudaSuccess) {
return result;
}
if (current_device != device_) {
set_device_back_to = current_device;
result = cudaSetDevice(device_);
if (result != cudaSuccess) {
return result;
}
}
}
// This performs the cudaMalloc
result = cudaMalloc(ptr, size);
if (result != cudaSuccess) {
return result;
}
/// Restore the caller's original device if it was changed above.
if (set_device_back_to != -1) {
result = cudaSetDevice(set_device_back_to);
if (result != cudaSuccess) {
return result;
}
}
return cudaSuccess;
}
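A caller-side sketch of the guarantee this helper provides (illustrative only; assumes `alloc` is a DeviceAllocation bound to device 1 while the caller works on device 0):

#include <cassert>
#include <cuda_runtime_api.h>

void malloc_example(cutlass::profiler::DeviceAllocation &alloc) {
  cudaSetDevice(0);                // caller's working device
  void *ptr = nullptr;
  alloc.malloc(&ptr, 1 << 20);     // cudaMalloc executes on the allocation's device
  int current = -1;
  cudaGetDevice(&current);
  assert(current == 0);            // the caller's device was restored
}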
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -29,7 +29,7 @@
*
**************************************************************************************************/
/* \file
\brief
*/
#include "cutlass/profiler/device_context.h"
@ -41,29 +41,16 @@ namespace profiler {
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_block(
Options const &options,
std::string const &name,
library::NumericTypeID type,
size_t capacity,
size_t device_index) {
int device = options.device.device_id(device_index);
device_memory_.emplace_back(type, capacity, device);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_tensor(
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count) {
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
@ -72,18 +59,40 @@ DeviceAllocation *DeviceContext::allocate_tensor(
DeviceAllocation *DeviceContext::allocate_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
size_t device_index) {
int device = options.device.device_id(device_index);
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count,
device);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_and_initialize_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index) {
DeviceAllocation *allocation =
allocate_tensor(options, name, type, layout_id, extent, stride,
batch_count, device_index);
if (options.initialization.enabled) {
Distribution data_distribution = options.initialization.data_distribution;
// check if data distribution is allowed to change
if(!options.initialization.fix_data_distribution) {
@ -129,13 +138,13 @@ DeviceAllocation *DeviceContext::allocate_tensor(
double stddev = data_distribution.gaussian.stddev;
int scale = data_distribution.int_scale;
if (name == "A" && data_distribution.gaussian.pnzA != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzA);
}
else if (name == "B" && data_distribution.gaussian.pnzB != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzB);
}
else if (name == "C" && data_distribution.gaussian.pnzC != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzC);
}
}
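For example (values illustrative), a run requesting 50% nonzeros in A only takes the first branch above; B and C keep the new fractional default of 1.0:

// Hypothetical values: pnzA == 0.5, pnzB == pnzC == 1.0, so only tensor "A"
// is re-issued with a sparsified Gaussian distribution.
data_distribution.set_gaussian(mean, stddev, scale, /*pnz=*/0.5);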
@ -147,7 +156,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
}
else {
allocation->initialize_random_device(
options.initialization.seed + seed_shift,
data_distribution);
}
}
@ -158,7 +167,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
}
else {
allocation->initialize_random_host(
options.initialization.seed + seed_shift,
data_distribution);
}
}
@ -167,20 +176,22 @@ DeviceAllocation *DeviceContext::allocate_tensor(
return allocation;
}
/// Allocates memory for sparse meta data
DeviceAllocation *DeviceContext::allocate_and_initialize_sparsemeta_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index) {
DeviceAllocation *allocation =
allocate_tensor(options, name, type, layout_id, extent, stride,
batch_count, device_index);
if (options.initialization.enabled) {
// TF32 has 4bit meta data. The rest has 2bit.
@ -188,12 +199,12 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
if (options.initialization.provider == library::Provider::kReferenceDevice) {
allocation->initialize_random_sparsemeta_device(
options.initialization.seed + seed_shift,
MetaSizeInBits);
}
else if (options.initialization.provider == library::Provider::kReferenceHost) {
allocation->initialize_random_sparsemeta_host(
options.initialization.seed + seed_shift,
MetaSizeInBits);
}
}

View File

@ -39,6 +39,7 @@
#include <vector>
#include "cutlass/core_io.h"
#include <cuda_runtime_api.h>
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/gemm_operation_profiler.h"
@ -46,7 +47,6 @@
#include "cutlass/library/singleton.h"
#include "cutlass/library/library.h"
#include "cutlass/library/handle.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
@ -485,6 +485,17 @@ Status GemmOperationProfiler::initialize_workspace(
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::Operation const* underlying_operation = operation;
if (problem_.split_k_mode == library::SplitKMode::kParallel) {
@ -496,12 +507,14 @@ Status GemmOperationProfiler::initialize_workspace(
library::GemmDescription const &operation_desc =
static_cast<library::GemmDescription const &>(operation->description());
bool is_sparse = operation_desc.tile_description.math_instruction.opcode_class == cutlass::library::OpcodeClassID::kSparseTensorOp;
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
gemm_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
gemm_workspace_.problem_count = 1;
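As a worked example of the replication rule above (sizes illustrative, not from the commit): with a 40 MiB L2 and a 25 MiB problem footprint, 3 * 40 MiB = 120 MiB exceeds 25 MiB, so five distinct copies are cycled through and back-to-back profiling iterations cannot be served from a warm L2.

// Illustrative arithmetic only (hypothetical sizes):
int64_t l2_bytes = 40ll << 20;       // 40 MiB L2
int64_t problem_bytes = 25ll << 20;  // 25 MiB problem footprint
int problem_count = (problem_bytes < 3 * l2_bytes)
    ? 1 + int((3 * l2_bytes) / problem_bytes)  // 1 + 4 -> 5
    : 1;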
@ -514,7 +527,7 @@ Status GemmOperationProfiler::initialize_workspace(
bool allocate_device_tensors = options.execution_mode != ExecutionMode::kDryRun;
if (allocate_device_tensors) {
int seed_shift = 0;
gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -522,10 +535,11 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k)},
{int(problem_.lda)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -533,10 +547,11 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.k), int(problem_.n)},
{int(problem_.ldb)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -544,25 +559,30 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
0 // device_index
);
gemm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
0 // device_index
);
}
@ -580,7 +600,7 @@ Status GemmOperationProfiler::initialize_workspace(
gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Computed->batch_stride();
/* Query device SM count to pass onto the kernel as an argument, where needed */
gemm_workspace_.arguments.sm_count = options.device.properties[0].multiProcessorCount;
}
//
@ -596,12 +616,34 @@ Status GemmOperationProfiler::initialize_workspace(
workspace_size = underlying_operation->get_device_workspace_size(&gemm_workspace_.configuration,
&gemm_workspace_.arguments);
if (is_sparse) {
// For sparse GEMM, get_device_workspace_size() returns the device workspace size
// for a single iteration, so multiply it by the number of iterations.
workspace_size *= gemm_workspace_.problem_count;
}
gemm_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
// Convert contents to structured-sparse form here.
if (is_sparse) {
uint8_t* profiler_workspaces[1];
profiler_workspaces[0] = reinterpret_cast<uint8_t*>(gemm_workspace_.A->data());
// Sparse operations have a different initialization interface.
// initialize_with_profiler_workspace converts the dense MxK tensor A into a
// compressed MxK/sp tensor A plus the metadata tensor E.
auto modifiable_underlying_op = const_cast<library::Operation*>(underlying_operation);
status = modifiable_underlying_op->initialize_with_profiler_workspace(
&gemm_workspace_.configuration,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data(),
profiler_workspaces,
gemm_workspace_.problem_count);
}
else {
status = underlying_operation->initialize(
&gemm_workspace_.configuration,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data());
}
if (status != Status::kSuccess) {
return status;
}
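For intuition on the shapes the sparse path produces (a sketch under the usual 2:4 structured-sparsity assumption; the actual conversion is internal to initialize_with_profiler_workspace):

// Illustrative shapes only. 2:4 sparsity keeps 2 of every 4 elements,
// halving the K extent of A and adding a metadata tensor E whose entries
// are 2-bit selectors (4-bit for TF32, matching the MetaSizeInBits logic
// in the device context above).
int m = 1024, k = 512;
int sp = 2;                    // compression factor for 2:4 sparsity
int k_compressed = k / sp;     // compressed A is m x 256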
@ -821,26 +863,14 @@ bool GemmOperationProfiler::verify_with_cublas_(
// Construct cuBLAS operators
//
CublasLtCreate handle;
cublasStatus_t status = handle.get_cublaslt_create_status();
if (status != CUBLAS_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
return true;
}
//
// Initialize state
@ -865,29 +895,34 @@ bool GemmOperationProfiler::verify_with_cublas_(
gemm_workspace_.arguments.beta = problem_.beta.data();
gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasLtGemmExDispatcher gemm_op(
gemm_desc,
gemm_workspace_.configuration,
gemm_workspace_.arguments
);
gemm_op.initialize_cublaslt();
if (!gemm_op.get_cublaslt_algo(handle, AlgorithmMode::kDefault)) {
return true;
}
if (gemm_op.status != Status::kSuccess) {
results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
return true;
}
status = gemm_op(handle);
// Handle errors
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasLt verification run failed with status: " << cublasLtGetStatusName(status) << "\n";
results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
return true;
}
results_.back().status = Status::kSuccess;
//
// Verify results
//
@ -930,9 +965,9 @@ bool GemmOperationProfiler::verify_with_reference_(
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem,
cutlass::library::NumericTypeID element_A,
cutlass::library::NumericTypeID element_B)
{
library::GemmDescription const &gemm_desc =
static_cast<library::GemmDescription const &>(operation->description());

View File

@ -376,14 +376,14 @@ int OperationProfiler::profile_all(
std::cerr << " @ provider " << operation->description().provider
<< " != library::Provider::kCUTLASS\n";
}
if (options.device.compute_capability(0) < min_cc) {
std::cerr << " @ compute_capability "
<< options.device.compute_capability(0)
<< " < min_cc " << min_cc << "\n";
}
if (options.device.compute_capability(0) > max_cc) {
std::cerr << " @ compute_capability "
<< options.device.compute_capability(0)
<< " > max_cc " << max_cc << "\n";
}
#endif
@ -391,8 +391,8 @@ int OperationProfiler::profile_all(
// Execute compatible cutlass operations if they satisfy the current device's compute capability
if (operation->description().kind == kind_ &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability(0) >= min_cc &&
options.device.compute_capability(0) <= max_cc) {
std::string operation_name(operation->description().name);
// Filter kernels by name

View File

@ -33,6 +33,7 @@
*/
#include <algorithm>
#include <set>
#include "cutlass/cutlass.h"
#include "cutlass/version.h"
@ -55,45 +56,97 @@ static char const *end_of_line = "\n
Options::Device::Device(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("device", device, 0);
// Gets the number of devices for future validation
cudaError_t result;
result = cudaGetDeviceCount(&num_devices);
if (result != cudaSuccess) {
throw std::runtime_error("cudaGetDeviceCount() failed");
}
// Gets the devices specified by the user.
// This preserves the user-specified order and checks for duplicates.
{
std::vector<int> temp_device_list;
cmdline.get_cmd_line_arguments("devices", temp_device_list);
if (temp_device_list.empty()) {
temp_device_list.push_back(0);
}
{
std::set<int> temp_device_set;
for (int device : temp_device_list) {
auto res = temp_device_set.insert(device);
if (!res.second) {
throw std::runtime_error("Duplicate device specified: " +
std::to_string(device));
} else if (device >= num_devices) {
throw std::runtime_error("Bad device ID: " +
std::to_string(device));
} else {
devices.push_back(device);
}
}
}
}
properties.resize(devices.size());
// Retrieves properties for all specified devices
for (size_t device_index = 0; device_index < devices.size(); device_index++) {
int device = devices[device_index];
result = cudaGetDeviceProperties(&properties[device_index], device);
if (result != cudaSuccess) {
throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
}
// Check that all devices are the same
if (device_index > 0) {
if ((properties[device_index].major != properties[0].major) ||
(properties[device_index].minor != properties[0].minor)) {
throw std::runtime_error("All selected devices must have the same "
"compute capability");
}
if (properties[device_index].l2CacheSize != properties[0].l2CacheSize) {
throw std::runtime_error("All selected devices must have the same "
"L2 cache size");
}
if (properties[device_index].multiProcessorCount != properties[0].multiProcessorCount) {
throw std::runtime_error("All selected devices must have the same "
"SM count");
}
}
result = cudaSetDevice(device);
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed for given device.");
}
// Permit overriding the compute capability
if (cmdline.check_cmd_line_flag("compute-capability")) {
int cc = compute_capability(device_index);
cmdline.get_cmd_line_argument("compute-capability", cc, cc);
properties[device_index].major = cc / 10;
properties[device_index].minor = cc % 10;
}
// Permit overriding the L2 cache capacity
if (cmdline.check_cmd_line_flag("llc-capacity")) {
int llc_capacity = 0;
cmdline.get_cmd_line_argument("llc-capacity", llc_capacity, 0);
if (llc_capacity >= 0) {
properties[device_index].l2CacheSize = (llc_capacity << 10);
}
}
}
}
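A usage sketch of the new flag (device IDs illustrative); per the checks above, all selected devices must match in compute capability, L2 capacity, and SM count:

$ cutlass_profiler --operation=gemm --m=4096 --n=4096 --k=4096 --devices=0,2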
void Options::Device::print_usage(std::ostream &out) const {
out << "Device:\n"
<< " --devices=<int>,<int>,... "
<< " CUDA Device IDs\n\n";
int device_count = 0;
cudaError_t result = cudaGetDeviceCount(&device_count);
@ -111,11 +164,11 @@ void Options::Device::print_usage(std::ostream &out) const {
break;
}
else {
out << " [" << idx << "] - "
<< prop.name << " - SM " << prop.major << "." << prop.minor << ", "
<< prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, "
<< "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB"
<< std::endl;
}
}
out << "\n";
@ -133,15 +186,8 @@ void Options::Device::print_usage(std::ostream &out) const {
}
void Options::Device::print_device_info(std::ostream &out) const {
out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;
@ -165,14 +211,28 @@ void Options::Device::print_device_info(std::ostream &out) const {
void Options::Device::print_options(std::ostream &out, int indent) const {
out
<< indent_str(indent) << "devices: ";
for (int device : devices) {
out << device << ',';
}
out
<< "\n"
<< indent_str(indent) << "clock: " << int(double(properties[0].clockRate) / 1000.0) << "\n"
<< indent_str(indent) << "compute-capability: " << compute_capability(0) << "\n";
}
/// Returns the device ID from a device index
int Options::Device::device_id(size_t device_index) const {
if (device_index >= devices.size()) {
throw std::runtime_error("Out of bounds device index: " +
std::to_string(device_index));
}
return devices.at(device_index);
}
/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
int Options::Device::compute_capability(int device_index) const {
return properties[device_index].major * 10 + properties[device_index].minor;
}
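For example, an SM 9.0 (Hopper) device reports major == 9 and minor == 0, so compute_capability(device_index) returns 90.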
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -207,10 +267,10 @@ Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) {
else {
// profiler chosen data distribution (allowed to change based on numeric types)
fix_data_distribution = false;
// set uniform data distribution with range [-4, 4]
data_distribution.set_uniform(-4, 4, 0);
}
}
@ -248,10 +308,10 @@ void Options::Initialization::get_distribution(
};
// Initialize pnz values to a default of 1.0 (100% nonzero)
dist.gaussian.pnz = 1.0;
dist.gaussian.pnzA = 1.0;
dist.gaussian.pnzB = 1.0;
dist.gaussian.pnzC = 1.0;
using KeyValueVector = std::vector<std::pair<std::string, std::string> >;
@ -335,7 +395,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
std::string mode = "default";
cmdline.get_cmd_line_argument("library-algo-mode", mode);
algorithm_mode = from_string<AlgorithmMode>(mode);
}
if (cmdline.check_cmd_line_flag("library-algos")) {
@ -353,7 +413,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
}
else {
int algo;
std::stringstream ss;
ss << token;
ss >> algo;
@ -396,12 +456,12 @@ void Options::Library::print_options(std::ostream &out, int indent) const {
Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0);
cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10);
cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);
if (cmdline.check_cmd_line_flag("providers")) {
std::vector<std::string> tokens;
@ -416,7 +476,7 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
else {
providers.push_back(library::Provider::kCUTLASS);
providers.push_back(library::Provider::kCUBLAS);
providers.push_back(library::Provider::kCUDNN);
}
}
@ -480,7 +540,7 @@ size_t Options::Profiling::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////
Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("verification-enabled", enabled, true);
if (enabled) {
cmdline.get_cmd_line_argument("verification-required", required, false);
@ -500,7 +560,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
}
if (cmdline.check_cmd_line_flag("verification-providers")) {
std::vector<std::string> tokens;
cmdline.get_cmd_line_arguments("verification-providers", tokens);
@ -516,7 +576,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
else {
providers.push_back(library::Provider::kCUBLAS);
providers.push_back(library::Provider::kReferenceDevice);
providers.push_back(library::Provider::kCUDNN);
}
}
@ -583,11 +643,11 @@ size_t Options::Verification::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////
Options::Report::Report(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("append", append, false);
cmdline.get_cmd_line_argument("output", output_path);
cmdline.get_cmd_line_argument("junit-output", junit_output_path);
if (cmdline.check_cmd_line_flag("tags")) {
cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
}
@ -687,11 +747,11 @@ Options::Options(cutlass::CommandLine const &cmdline):
device(cmdline),
initialization(cmdline),
library(cmdline),
profiling(cmdline),
verification(cmdline),
report(cmdline),
about(cmdline) {
if (cmdline.check_cmd_line_flag("mode")) {
std::string token;
cmdline.get_cmd_line_argument("mode", token);

View File

@ -94,7 +94,7 @@ PerformanceReport::PerformanceReport(
if (options_.report.append) {
std::ifstream test_output_file(op_file_name_);
if (test_output_file.is_open()) {
print_header = false;
test_output_file.close();
@ -145,7 +145,7 @@ void PerformanceReport::append_result(PerformanceResult result) {
if (options_.report.verbose) {
std::cout << "\n";
print_result_pretty_(std::cout, result) << std::flush;
}
if (junit_output_file_.is_open()) {
@ -237,7 +237,7 @@ static const char *disposition_status_color(Disposition disposition) {
/// Prints the result in human readable form
std::ostream & PerformanceReport::print_result_pretty_(
std::ostream &out,
PerformanceResult const &result,
bool use_shell_coloring) {
@ -251,14 +251,14 @@ std::ostream & PerformanceReport::print_result_pretty_(
int column_idx = 0;
for (auto const & tag : options_.report.pivot_tags) {
out << (column_idx++ ? "," : "") << tag.first << ":" << tag.second;
}
out << "\n";
}
std::string shell_color_bright = use_shell_coloring ? SHELL_COLOR_BRIGHT() : "";
std::string shell_color_end = use_shell_coloring ? SHELL_COLOR_END() : "";
auto _disposition_status_color = [&](Disposition d) -> const char * {
return use_shell_coloring ? disposition_status_color(d) : "";
};
@ -277,7 +277,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
static int const indent_spaces = 16;
for(auto & m : result.verification_map) {
out << std::right << std::setw(indent_spaces) << library::to_string(m.first, true) << ": " << to_string(m.second, true) << "\n";
}
}
@ -287,7 +287,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
int column_idx = 0;
for (auto const &arg : result.arguments) {
if (!arg.second.empty()) {
out << " --" << arg.first << "=" << arg.second;
column_idx += int(4 + arg.first.size() + arg.second.size());
if (column_idx > 98) {
out << " \\\n ";
@ -297,7 +297,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
}
out << "\n\n";
out
<< " Bytes: " << result.bytes << " bytes\n"
<< " FLOPs: " << result.flops << " flops\n"
<< " FLOPs/Byte: " << (result.flops / result.bytes) << "\n\n";
@ -325,7 +325,7 @@ std::ostream & PerformanceReport::print_csv_header_(
out << (column_idx++ ? "," : "") << tag.first;
}
out
<< (column_idx ? "," : "") << "Problem,Provider"
<< ",OperationKind,Operation,Disposition,Status";
@ -333,7 +333,7 @@ std::ostream & PerformanceReport::print_csv_header_(
out << "," << arg_name;
}
out
<< ",Bytes"
<< ",Flops"
<< ",Flops/Byte"
@ -347,7 +347,7 @@ std::ostream & PerformanceReport::print_csv_header_(
/// Print the result in CSV output
std::ostream & PerformanceReport::print_result_csv_(
std::ostream &out,
PerformanceResult const &result) {
int column_idx = 0;
@ -357,8 +357,8 @@ std::ostream & PerformanceReport::print_result_csv_(
out << (column_idx++ ? "," : "") << tag.second;
}
out
<< (column_idx ? "," : "")
<< result.problem_index
<< "," << to_string(result.provider, true)
<< "," << to_string(result.op_kind)
@ -370,7 +370,7 @@ std::ostream & PerformanceReport::print_result_csv_(
out << "," << arg.second;
}
out
<< "," << result.bytes
<< "," << result.flops
<< "," << result.flops / result.bytes
@ -387,7 +387,7 @@ std::ostream & PerformanceReport::print_result_csv_(
else {
out << std::string(2, ',');
}
return out;
@ -451,25 +451,25 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform
case Disposition::kNotSupported:
skipped = true;
break;
case Disposition::kPassed:
case Disposition::kNotVerified:
break;
case Disposition::kFailed:
case Disposition::kIncorrect:
failed = true;
break;
case Disposition::kInvalidProblem:
case Disposition::kInvalid:
error = true;
break;
};
if (skipped) {
out << "status=\"notrun\"";
} else {
out << "status=\"run\"";
}
out << ">" << std::endl;
if (failed) {
@ -488,7 +488,7 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform
out << " </testcase>" << std::endl;
return out;
}

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kRank2K,
@ -95,7 +95,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size Syrk kernel:\n"
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=symmetric --n=1024 --k=128\n\n"
<< "Profile a particular problem size Herk kernel:\n"
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=hermitian --n=1024 --k=128\n\n"
@ -118,7 +118,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=rank_2k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to rank_2k kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=rank_2k \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -148,22 +148,22 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -187,29 +187,29 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->n), int(this->k)}).front();
@ -311,14 +311,14 @@ void Rank2KOperationProfiler::RankKProblem::initialize_result(
/// Extracts the problem dimensions
Status Rank2KOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -326,7 +326,7 @@ Status Rank2KOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
@ -350,14 +350,14 @@ Status Rank2KOperationProfiler::initialize_configuration(
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}
/// Initializes the performance result
void Rank2KOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -365,7 +365,7 @@ void Rank2KOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -380,19 +380,30 @@ void Rank2KOperationProfiler::initialize_result_(
/// Initializes workspace
Status Rank2KOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -400,10 +411,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.lda)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -411,10 +423,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -422,23 +435,30 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
@ -487,7 +507,7 @@ Status Rank2KOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -516,7 +536,7 @@ bool Rank2KOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&rank_k_workspace_.arguments,
rank_k_workspace_.host_workspace.data(),
rank_k_workspace_.device_workspace.data());
@ -564,8 +584,8 @@ bool Rank2KOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -591,7 +611,7 @@ bool Rank2KOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_with_cublas_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -601,13 +621,13 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::RankKDescription const &rank_k_desc =
static_cast<library::RankKDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -636,8 +656,8 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
rank_k_workspace_.arguments.beta = problem_.beta.data();
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
rank_k_workspace_.configuration,
rank_k_workspace_.arguments
);
@ -669,7 +689,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -694,7 +714,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool Rank2KOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
RankKOperationProfiler::RankKOperationProfiler(Options const &options):
RankKOperationProfiler::RankKOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kRankK,
@ -94,7 +94,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size Syrk kernel:\n"
<< " $ cutlass_profiler --operation=rank_k --blas_mode=symmetric --n=1024 --k=128\n\n"
<< "Profile a particular problem size Herk kernel:\n"
<< " $ cutlass_profiler --operation=rank_k --blas_mode=hermitian --n=1024 --k=128\n\n"
@ -117,7 +117,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=rank_k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to rank_k kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=rank_k \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -147,22 +147,22 @@ Status RankKOperationProfiler::RankKProblem::parse(
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -182,29 +182,29 @@ Status RankKOperationProfiler::RankKProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->n), int(this->k)}).front();
@ -252,7 +252,7 @@ int64_t RankKOperationProfiler::RankKProblem::flops(library::RankKDescription co
case library::MathOperationID::kMultiplyAddComplexFastF32:
flops_ *= 4;
break;
case library::MathOperationID::kMultiplyAddGaussianComplex:
flops_ *= 3;
break;
@ -300,14 +300,14 @@ void RankKOperationProfiler::RankKProblem::initialize_result(
/// Extracts the problem dimensions
Status RankKOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -315,7 +315,7 @@ Status RankKOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
@ -337,14 +337,14 @@ Status RankKOperationProfiler::initialize_configuration(
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}
/// Initializes the performance result
void RankKOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -352,7 +352,7 @@ void RankKOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -368,7 +368,7 @@ void RankKOperationProfiler::initialize_result_(
case library::MathOperationID::kMultiplyAddComplex:
result.flops *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
result.flops *= 4;
break;
@ -380,19 +380,30 @@ void RankKOperationProfiler::initialize_result_(
/// Initializes workspace
Status RankKOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
rank_k_workspace_.A = device_context.allocate_tensor(
rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -400,10 +411,11 @@ Status RankKOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
rank_k_workspace_.C = device_context.allocate_tensor(
rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -411,23 +423,30 @@ Status RankKOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
rank_k_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
@ -476,7 +495,7 @@ Status RankKOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -504,7 +523,7 @@ bool RankKOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&rank_k_workspace_.arguments,
&rank_k_workspace_.arguments,
rank_k_workspace_.host_workspace.data(),
rank_k_workspace_.device_workspace.data());
@ -552,8 +571,8 @@ bool RankKOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -579,7 +598,7 @@ bool RankKOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -589,13 +608,13 @@ bool RankKOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::RankKDescription const &rank_k_desc =
library::RankKDescription const &rank_k_desc =
static_cast<library::RankKDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -623,8 +642,8 @@ bool RankKOperationProfiler::verify_with_cublas_(
rank_k_workspace_.arguments.beta = problem_.beta.data();
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
rank_k_workspace_.configuration,
rank_k_workspace_.arguments
);
@ -656,7 +675,7 @@ bool RankKOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -681,7 +700,7 @@ bool RankKOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool RankKOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -51,23 +51,23 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
SparseGemmOperationProfiler::SparseGemmOperationProfiler(Options const &options):
SparseGemmOperationProfiler::SparseGemmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kSparseGemm,
{
{ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (e.g. sparse, ...)"},
{ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"},
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"E"}, "Tensor storing the E operand"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"},
{ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"},
{ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (e.g. sparse, ...)"},
{ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"},
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"E"}, "Tensor storing the E operand"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"},
{ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"},
}
) {
@ -109,7 +109,7 @@ void SparseGemmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=SparseGemm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to gemm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=SparseGemm \\ \n"
<< " --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -125,7 +125,7 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
library::SparseGemmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
@ -135,17 +135,17 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -168,24 +168,24 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -252,14 +252,14 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result(
/// Extracts the problem dimensions
Status SparseGemmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SparseGemmDescription const &operation_desc =
library::SparseGemmDescription const &operation_desc =
static_cast<library::SparseGemmDescription const &>(operation->description());
if (operation_desc.gemm_kind != library::GemmKind::kSparse) {
@ -291,14 +291,14 @@ Status SparseGemmOperationProfiler::initialize_configuration(
gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&gemm_workspace_.configuration, &gemm_workspace_.arguments);
}
/// Initializes the performance result
void SparseGemmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::SparseGemmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -308,7 +308,7 @@ void SparseGemmOperationProfiler::initialize_result_(
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
// Input bytes read and Output bytes written for the gemm problem
@ -337,19 +337,30 @@ void SparseGemmOperationProfiler::initialize_result_(
/// Initializes workspace
Status SparseGemmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SparseGemmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::SparseGemmDescription const &operation_desc =
static_cast<library::SparseGemmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
gemm_workspace_.A = device_context.allocate_tensor(
gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -357,10 +368,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k) / int(problem_.sparse)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.B = device_context.allocate_tensor(
gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -368,10 +380,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.k), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.C = device_context.allocate_tensor(
gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -379,18 +392,22 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
gemm_workspace_.E = device_context.allocate_sparsemeta_tensor(
gemm_workspace_.E = device_context.allocate_and_initialize_sparsemeta_tensor(
options,
"E",
operation_desc.E.element,
@ -399,15 +416,19 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k) / int(problem_.sparse) / int(problem_.elements_per_128b)},
{int(problem_.lde)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
@ -456,7 +477,7 @@ Status SparseGemmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool SparseGemmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -486,7 +507,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&gemm_workspace_.arguments,
&gemm_workspace_.arguments,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data());
@ -510,7 +531,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
if (options.verification.enabled) {
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
@ -537,7 +558,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
/// Measures performance results
bool SparseGemmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -565,7 +586,7 @@ bool SparseGemmOperationProfiler::profile(
gemm_workspace_.device_workspace.data()
);
}
return true;
}

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
SymmOperationProfiler::SymmOperationProfiler(Options const &options):
SymmOperationProfiler::SymmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kSymm,
@ -96,7 +96,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size SYMM kernel:\n"
<< " $ cutlass_profiler --operation=Symm --blas_mode=symmetric --m=1024 --n=128\n\n"
<< "Profile a particular problem size HEMM kernel:\n"
<< " $ cutlass_profiler --operation=Symm --blas_mode=hermitian --m=1024 --n=128\n\n"
@ -122,7 +122,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=Symm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to symm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=Symm \\ \n"
<< " --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -152,22 +152,22 @@ Status SymmOperationProfiler::SymmProblem::parse(
library::SymmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
}
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -191,29 +191,29 @@ Status SymmOperationProfiler::SymmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
if (operation_desc.side_mode == SideMode::kLeft) {
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -240,12 +240,12 @@ int64_t SymmOperationProfiler::SymmProblem::bytes(library::SymmDescription const
if (operation_desc.side_mode == SideMode::kLeft) {
bytes =
int64_t(library::sizeof_bits(operation_desc.A.element) * m / 8) * (m + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
} else if (operation_desc.side_mode == SideMode::kRight) {
bytes =
int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * (n + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
}
// Set is_beta_zero true if beta is zero
@ -277,7 +277,7 @@ int64_t SymmOperationProfiler::SymmProblem::flops(library::SymmDescription const
case library::MathOperationID::kMultiplyAddComplex:
flops_ *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
flops_ *= 4;
break;
@ -334,14 +334,14 @@ void SymmOperationProfiler::SymmProblem::initialize_result(
/// Extracts the problem dimensions
Status SymmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SymmDescription const &operation_desc =
library::SymmDescription const &operation_desc =
static_cast<library::SymmDescription const &>(operation->description());
if (operation_desc.symm_kind != library::SymmKind::kUniversal) {
@ -349,14 +349,14 @@ Status SymmOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
symm_workspace_.configuration.problem_size.m() = int(problem_.m);
symm_workspace_.configuration.problem_size.n() = int(problem_.n);
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
? int(problem_.m) : int(problem_.n);
symm_workspace_.configuration.lda = problem_.lda;
symm_workspace_.configuration.ldb = problem_.ldb;
@ -374,14 +374,14 @@ Status SymmOperationProfiler::initialize_configuration(
symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&symm_workspace_.configuration, &symm_workspace_.arguments);
}
/// Initializes the performance result
void SymmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::SymmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -389,7 +389,7 @@ void SymmOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -404,20 +404,31 @@ void SymmOperationProfiler::initialize_result_(
/// Initializes workspace
Status SymmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SymmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::SymmDescription const &operation_desc =
static_cast<library::SymmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
if (operation_desc.side_mode == SideMode::kLeft) {
symm_workspace_.A = device_context.allocate_tensor(
symm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -425,10 +436,11 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.m)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
} else if (operation_desc.side_mode == SideMode::kRight) {
symm_workspace_.A = device_context.allocate_tensor(
symm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -436,11 +448,12 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
}
symm_workspace_.B = device_context.allocate_tensor(
symm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -448,10 +461,11 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
symm_workspace_.C = device_context.allocate_tensor(
symm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -459,23 +473,30 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
symm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
symm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
symm_workspace_.Computed->copy_from_device(symm_workspace_.C->data());
@ -524,7 +545,7 @@ Status SymmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -553,7 +574,7 @@ bool SymmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&symm_workspace_.arguments,
&symm_workspace_.arguments,
symm_workspace_.host_workspace.data(),
symm_workspace_.device_workspace.data());
@ -601,8 +622,8 @@ bool SymmOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -628,7 +649,7 @@ bool SymmOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -638,13 +659,13 @@ bool SymmOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::SymmDescription const &symm_desc =
library::SymmDescription const &symm_desc =
static_cast<library::SymmDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -673,8 +694,8 @@ bool SymmOperationProfiler::verify_with_cublas_(
symm_workspace_.arguments.beta = problem_.beta.data();
symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasSymmDispatcher symm_op(
symm_desc,
detail::cublasSymmDispatcher symm_op(
symm_desc,
symm_workspace_.configuration,
symm_workspace_.arguments
);
@ -706,7 +727,7 @@ bool SymmOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -731,7 +752,7 @@ bool SymmOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool SymmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kTrmm,
@ -113,7 +113,7 @@ void TrmmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=Trmm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to trmm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=Trmm \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -143,22 +143,22 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
library::TrmmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
}
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -182,29 +182,29 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
if (operation_desc.side_mode == SideMode::kLeft) {
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -265,14 +265,14 @@ void TrmmOperationProfiler::TrmmProblem::initialize_result(
/// Extracts the problem dimensions
Status TrmmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::TrmmDescription const &operation_desc =
library::TrmmDescription const &operation_desc =
static_cast<library::TrmmDescription const &>(operation->description());
if (operation_desc.trmm_kind != library::TrmmKind::kUniversal) {
@ -280,14 +280,14 @@ Status TrmmOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
trmm_workspace_.configuration.problem_size.m() = int(problem_.m);
trmm_workspace_.configuration.problem_size.n() = int(problem_.n);
trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
? int(problem_.m) : int(problem_.n);
trmm_workspace_.configuration.lda = problem_.lda;
trmm_workspace_.configuration.ldb = problem_.ldb;
@ -303,14 +303,14 @@ Status TrmmOperationProfiler::initialize_configuration(
trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&trmm_workspace_.configuration, &trmm_workspace_.arguments);
}
/// Initializes the performance result
void TrmmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::TrmmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -318,30 +318,30 @@ void TrmmOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
if (operation_desc.side_mode == SideMode::kLeft) {
// Input bytes read and Output bytes written for the trmm problem
result.bytes =
result.bytes =
// Half matrix including the diagonal will have (M*(M+1))/2 elements
int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * (problem_.m + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
} else if (operation_desc.side_mode == SideMode::kRight) {
// Input bytes read and Output bytes written for the trmm problem
result.bytes =
result.bytes =
// Half matrix including the diagonal will have (N*(N+1))/2 elements
int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.n / 8) * (problem_.n + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
}
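// Sanity check on the byte counts above (worked example, not from the source):
// for a left-side TRMM with half-precision A, B, D (2 bytes per element),
// m = 1024, n = 512:
//   A reads  2 * 1024 * 1025 / 2 = 1,049,600 bytes (triangle incl. diagonal)
//   B reads  2 * 1024 * 512      = 1,048,576 bytes
//   D writes 2 * 1024 * 512      = 1,048,576 bytes   -> ~3.1 MB total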
// FLOPs = 2 * [ ( M * (M+1)/2 * N ) ] // Beta is zero
result.flops = problem_.m * (problem_.m + 1) * problem_.n;
result.runtime = 0;
// complex-valued support
@ -349,11 +349,11 @@ void TrmmOperationProfiler::initialize_result_(
case library::MathOperationID::kMultiplyAddComplex:
result.flops *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
result.flops *= 4;
break;
default: break;
}
@ -361,20 +361,31 @@ void TrmmOperationProfiler::initialize_result_(
/// Initializes workspace
Status TrmmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::TrmmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::TrmmDescription const &operation_desc =
static_cast<library::TrmmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
if (operation_desc.side_mode == SideMode::kLeft) {
trmm_workspace_.A = device_context.allocate_tensor(
trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -382,10 +393,11 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.m)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
} else if (operation_desc.side_mode == SideMode::kRight) {
trmm_workspace_.A = device_context.allocate_tensor(
trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -393,11 +405,12 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
}
trmm_workspace_.B = device_context.allocate_tensor(
trmm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -405,23 +418,30 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
trmm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldd)}
{int(problem_.ldd)},
1, // batch_count
0 // device_index
);
trmm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldd)}
{int(problem_.ldd)},
1, // batch_count
0 // device_index
);
}
@ -467,7 +487,7 @@ Status TrmmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -495,7 +515,7 @@ bool TrmmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&trmm_workspace_.arguments,
&trmm_workspace_.arguments,
trmm_workspace_.host_workspace.data(),
trmm_workspace_.device_workspace.data());
@ -543,8 +563,8 @@ bool TrmmOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -570,7 +590,7 @@ bool TrmmOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -580,13 +600,13 @@ bool TrmmOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::TrmmDescription const &trmm_desc =
library::TrmmDescription const &trmm_desc =
static_cast<library::TrmmDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -614,8 +634,8 @@ bool TrmmOperationProfiler::verify_with_cublas_(
trmm_workspace_.arguments.beta = problem_.beta.data();
trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasTrmmDispatcher trmm_op(
trmm_desc,
detail::cublasTrmmDispatcher trmm_op(
trmm_desc,
trmm_workspace_.configuration,
trmm_workspace_.arguments
);
@ -646,7 +666,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -671,7 +691,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool TrmmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -37,9 +37,11 @@
*/
#include <memory>
#include <sstream>
#include "cutlass/platform/platform.h"
#include "cutlass/numeric_types.h"
#include "cutlass/trace.h"
#include "exceptions.h"
namespace cutlass {
@ -61,8 +63,20 @@ T* allocate(size_t count = 1) {
cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
if (cuda_error != cudaSuccess) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
std::ostringstream os;
os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
#endif
throw cuda_exception("Failed to allocate memory", cuda_error);
}
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
else {
std::ostringstream os;
os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
}
#endif
return ptr;
}
@ -85,11 +99,36 @@ void free(T* ptr) {
template <typename T>
void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
size_t bytes = count * sizeof_bits<T>::value / 8;
if (bytes == 0 && count > 0)
if (bytes == 0 && count > 0) {
bytes = 1;
}
cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
if (cuda_error != cudaSuccess) {
throw cuda_exception("cudaMemcpy() failed", cuda_error);
std::ostringstream os;
os << "cutlass::device_memory::copy: cudaMemcpy() failed: "
<< "dst=" << dst << ", src=" << src
<< ", bytes=" << bytes << ", count=" << count;
if (kind == cudaMemcpyHostToDevice) {
os << ", kind=cudaMemcpyHostToDevice";
}
else if (kind == cudaMemcpyDeviceToHost) {
os << ", kind=cudaMemcpyDeviceToHost";
}
else if (kind == cudaMemcpyDeviceToDevice) {
os << ", kind=cudaMemcpyDeviceToDevice";
}
else if (kind == cudaMemcpyHostToHost) {
os << ", kind=cudaMemcpyHostToHost";
}
else if (kind == cudaMemcpyDefault) {
os << ", kind=cudaMemcpyDefault";
}
else {
os << ", kind=Unknown";
}
os << ", error: " << cudaGetErrorString(cuda_error);
throw cuda_exception(os.str().c_str(), cuda_error);
}
}
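// Usage sketch for the byte computation above (illustration only): for a
// sub-byte element type such as cutlass::int4b_t, sizeof_bits<T>::value = 4,
// so copying count = 10 elements moves 10 * 4 / 8 = 5 bytes, while a nonzero
// count whose byte size rounds down to 0 (e.g. count = 1) is bumped to 1 byte:
//
//   cutlass::device_memory::copy(dst, src, 10, cudaMemcpyDeviceToDevice); // 5 bytes
//   cutlass::device_memory::copy(dst, src, 1,  cudaMemcpyDeviceToDevice); // 1 byte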

View File

@ -51,6 +51,8 @@ struct Distribution {
struct {
double min;
double max;
// Fraction of elements set to NaN (in [0, 1])
double pnan;
} uniform;
/// Gaussian distribution
@ -82,17 +84,18 @@ struct Distribution {
Distribution() : kind(Invalid), int_scale(0) {}
/// Configures distribution as uniform random
Distribution &set_uniform(double _min, double _max, int _int_scale = 0) {
/// Configures distribution as uniform random
Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) {
kind = Uniform;
uniform.min = _min;
uniform.max = _max;
int_scale = _int_scale;
uniform.pnan = _pnan;
return *this;
}
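// Usage sketch for the new pnan knob (values assumed for illustration):
//
//   cutlass::Distribution dist;
//   dist.set_uniform(/*min=*/-4.0, /*max=*/4.0, /*int_scale=*/-1, /*pnan=*/0.05);
//   // ~5% of elements drawn through this distribution become NaN when it is
//   // consumed by TensorFillRandom / BlockFillRandom.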
/// Configures distribution as Gaussian distribution
Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 100.0) {
Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) {
kind = Gaussian;
gaussian.mean = _mean;
gaussian.stddev = _stddev;
@ -125,7 +128,8 @@ struct Distribution {
inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) {
switch (dist.kind) {
case cutlass::Distribution::Uniform:
out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max;
out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max
<< ", pnan: " << dist.uniform.pnan;
break;
case cutlass::Distribution::Gaussian:
out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev

View File

@ -177,16 +177,25 @@ public:
void reserve(
size_t count, ///< size of tensor in elements
bool device_backed_ = true) { ///< if true, device memory is also allocated
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")");
#endif
device_.reset();
host_.clear();
size_t count_container = count_to_container_storage_unit_count(count);
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")");
#endif
host_.resize(count_container);
// Allocate memory
StorageUnit* device_memory = nullptr;
if (device_backed_) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")");
#endif
device_memory = device_memory::allocate<StorageUnit>(count_container);
}
device_.reset(device_memory, device_backed_ ? count_container : 0);
@ -394,7 +403,7 @@ public:
void sync_device() {
if (device_backed()) {
device_memory::copy_to_device(
device_.get(), host_.data(), host_.capacity());
device_.get(), host_.data(), host_.size());
}
}
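// Usage sketch (hypothetical sizes): sync_device() now copies host_.size()
// storage units rather than the allocation's capacity:
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 64});
//   tensor.host_view().at({0, 0}) = 1.0f;
//   tensor.sync_device();   // copies exactly 128 * 64 floats to the device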

View File

@ -35,6 +35,8 @@
#pragma once
#include "cute/layout.hpp"
#include "cute/container/array.hpp" // cute::array
#include "cutlass/conv/convolution.h" // cutlass::conv::Operator
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -57,6 +57,7 @@
#include "cutlass/complex.h"
#include "cutlass/tensor_view.h"
#include "cutlass/blas3.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/vector.h"
@ -117,6 +118,7 @@ struct RandomGaussianFunc {
int int_scale;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
//
// Methods
@ -127,12 +129,14 @@ struct RandomGaussianFunc {
uint64_t seed_ = 0,
Element mean_ = 0,
Element stddev_ = 1,
int int_scale_ = -1
int int_scale_ = -1,
int exclude_zero_ = -1
):
seed(seed_),
mean(static_cast<FloatType>(mean_)),
stddev(static_cast<FloatType>(stddev_)),
int_scale(int_scale_) {
int_scale(int_scale_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
@ -178,6 +182,15 @@ struct RandomGaussianFunc {
result = Element(rnd);
}
if (params.exclude_zero >= 0 && result == Element(0.0)) {
if (rnd > FloatType(0)) {
rnd += FloatType(1);
} else {
rnd -= FloatType(1);
}
result = Element(rnd);
}
return result;
}
};
@ -203,6 +216,7 @@ struct RandomGaussianFunc<complex<Real>> {
int int_scale;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
//
// Methods
@ -213,12 +227,14 @@ struct RandomGaussianFunc<complex<Real>> {
uint64_t seed_ = 0,
Real mean_ = 0,
Real stddev_ = 1,
int int_scale_ = -1
int int_scale_ = -1,
int exclude_zero_ = -1
):
seed(seed_),
mean(static_cast<FloatType>(mean_)),
stddev(static_cast<FloatType>(stddev_)),
int_scale(int_scale_) {
int_scale(int_scale_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(1) << int_scale);
float_scale_up += FloatType(0.5) * float_scale_up;
@ -272,6 +288,18 @@ struct RandomGaussianFunc<complex<Real>> {
result = Element(Real(rnd_r), Real(rnd_i));
}
if (params.exclude_zero >= 0 &&
result.real() == Real(0.0) &&
result.imag() == Real(0.0)) {
if (rnd_r > FloatType(0)) {
rnd_r += FloatType(1);
} else {
rnd_r -= FloatType(1);
}
result = Element(Real(rnd_r), Real(rnd_i));
}
return result;
}
};
@ -358,6 +386,7 @@ void TensorFillRandomGaussian(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomGaussianFunc<Element>;
@ -366,7 +395,7 @@ void TensorFillRandomGaussian(
TensorForEach<Func, Layout::kRank, Params>(
view.extent(),
Params(view, typename RandomFunc::Params(seed, mean, stddev, bits)),
Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)),
/*grid_size*/0, /*block_size*/0,
stream
);
@ -399,7 +428,7 @@ void BlockFillRandomGaussian(
namespace detail {
/// Computes a random Gaussian distribution
/// Computes a random uniform distribution
template <typename Element> ///< Element type
struct RandomUniformFunc {
@ -424,8 +453,10 @@ struct RandomUniformFunc {
FloatType range;
FloatType max;
int int_scale;
double pnan;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
/// Default ctor
CUTLASS_HOST_DEVICE
@ -440,15 +471,25 @@ struct RandomUniformFunc {
uint64_t seed_ = 0,
Element max_ = 1,
Element min = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
int exclude_zero_ = -1
):
seed(seed_),
range(static_cast<FloatType>(max_) - static_cast<FloatType>(min)),
max(static_cast<FloatType>(max_)),
int_scale(int_scale_) {
int_scale(int_scale_),
pnan(pnan_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero >= 0) {
range = (min == Element(0)) ? range - FloatType(1): range;
max = (max_ == Element(0)) ? max - FloatType(1): max;
}
}
};
@ -479,6 +520,13 @@ struct RandomUniformFunc {
CUTLASS_DEVICE
Element operator()() {
// Draw random float in [0.0, 1.0] to determine if element should be NaN.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
return Element(NAN);
}
}
FloatType rnd = random_uniform_float<FloatType>(&rng_state);
rnd = params.max - params.range * rnd;
@ -494,6 +542,15 @@ struct RandomUniformFunc {
result = Element(rnd);
}
if (params.exclude_zero >= 0 && result == Element(0.0)) {
if (rnd > FloatType(0)) {
rnd = std::min(params.max, rnd + FloatType(1));
} else {
rnd = std::max((params.max - params.range), rnd - FloatType(1));
}
result = Element(rnd);
}
return result;
}
};
@ -525,8 +582,10 @@ struct RandomUniformFunc<complex<Real>> {
FloatType range;
FloatType min;
int int_scale;
double pnan;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
/// Default ctor
CUTLASS_HOST_DEVICE
@ -541,16 +600,26 @@ struct RandomUniformFunc<complex<Real>> {
uint64_t seed_ = 0,
FloatType max = 1,
FloatType min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
int exclude_zero_ = -1
):
seed(seed_),
range(static_cast<FloatType>(max - min_)),
min(static_cast<FloatType>(min_)),
int_scale(int_scale_) {
int_scale(int_scale_),
pnan(pnan_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(1) << int_scale);
float_scale_up += FloatType(0.5) * float_scale_up;
float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero >= 0) {
min = (min == FloatType(0)) ? min + FloatType(1): min;
range = (max == FloatType(0)) ? range - FloatType(1): range;
}
}
};
@ -581,6 +650,13 @@ struct RandomUniformFunc<complex<Real>> {
CUTLASS_DEVICE
Element operator()() {
// Draw random float in [0.0, 1.0] to determine if element should be NaN.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
return Element(Real(NAN), Real(NAN));
}
}
FloatType rnd_r = random_uniform_float<FloatType>(&rng_state);
FloatType rnd_i = random_uniform_float<FloatType>(&rng_state);
@ -604,11 +680,23 @@ struct RandomUniformFunc<complex<Real>> {
result = Element(Real(rnd_r), Real(rnd_i));
}
if (params.exclude_zero >= 0 &&
result.real() == Real(0.0) &&
result.imag() == Real(0.0)) {
if (rnd_r > FloatType(0)) {
rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1));
} else {
rnd_r = std::max((params.min), rnd_r - FloatType(1));
}
result = Element(Real(rnd_r), Real(rnd_i));
}
return result;
}
};
/// Computes a random Gaussian distribution
/// Computes a random uniform distribution
template <
typename Element, ///< Element type
typename Layout> ///< Layout function
@ -693,13 +781,15 @@ void TensorFillRandomUniform(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Fraction of elements set to NaN (in [0, 1]).
int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomUniformFunc<Element>;
using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
using Params = typename Func::Params;
typename RandomFunc::Params random(seed, max, min, bits);
typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero);
TensorForEach<Func, Layout::kRank, Params>(
view.extent(),
@ -722,11 +812,12 @@ void BlockFillRandomUniform(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Fraction of elements set to NaN (in [0, 1]).
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomUniformFunc<Element>;
typename RandomFunc::Params params(seed, max, min, bits);
typename RandomFunc::Params params(seed, max, min, bits, pnan);
BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
}
@ -1672,7 +1763,11 @@ void TensorFillRandom(
TensorView<Element, Layout> view, ///< destination tensor
uint64_t seed,
Distribution dist,
cudaStream_t stream = nullptr) {
cudaStream_t stream = nullptr,
int exclude_zero = -1 ///< If non-negative, excludes 0.
/// Note that setting this flag will produce extra ±1 values,
/// since zeros are replaced by adding or subtracting 1.
) {
using Real = typename RealType<Element>::Type;
@ -1683,6 +1778,7 @@ void TensorFillRandom(
static_cast<Real>(dist.gaussian.mean),
static_cast<Real>(dist.gaussian.stddev),
dist.int_scale,
exclude_zero,
stream);
} else if (dist.kind == Distribution::Uniform) {
TensorFillRandomUniform<Element, Layout>(
@ -1691,6 +1787,8 @@ void TensorFillRandom(
static_cast<Real>(dist.uniform.max),
static_cast<Real>(dist.uniform.min),
dist.int_scale,
dist.uniform.pnan,
exclude_zero,
stream);
}
}
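// Usage sketch combining the new knobs (names and values assumed):
//
//   cutlass::Distribution dist;
//   dist.set_uniform(-2.0, 2.0, /*int_scale=*/-1, /*pnan=*/0.01);   // ~1% NaNs
//   cutlass::reference::device::TensorFillRandom(
//       tensor.device_view(), /*seed=*/2024, dist,
//       /*stream=*/nullptr, /*exclude_zero=*/1);   // also replace exact zeros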
@ -1753,6 +1851,7 @@ void BlockFillRandom(
static_cast<Real>(dist.uniform.max),
static_cast<Real>(dist.uniform.min),
dist.int_scale,
dist.uniform.pnan,
stream);
}
}

View File

@ -128,7 +128,8 @@ template<
class EpilogueFusionParams
>
struct ConvReferenceImpl {
using ElementAcc = typename EpilogueFusionParams::ElementAcc;
// Hard-code the accumulator type to float to avoid data loss during the accumulating add.
using ElementAcc = cutlass::platform::conditional_t<cutlass::platform::is_same_v<typename EpilogueFusionParams::ElementAcc, double>, double, float>;
using ElementC = typename EpilogueFusionParams::ElementC;
using ElementOut = typename EpilogueFusionParams::ElementOut;
using ElementScalar = typename EpilogueFusionParams::ElementScalar;
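// Equivalent standalone sketch of the accumulator selection above, written
// against the standard library instead of cutlass::platform (illustration):
//
//   template <class T>
//   using ReferenceAcc = std::conditional_t<std::is_same_v<T, double>, double, float>;
//   static_assert(std::is_same_v<ReferenceAcc<double>, double>);
//   static_assert(std::is_same_v<ReferenceAcc<int8_t>, float>);   // promoted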

View File

@ -342,7 +342,8 @@ void gett_epilogue(
ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]);
// per-row alpha
if (raw_pointer_cast(epilogue_params.Valpha.data())) {
converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b));
converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l));
converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b));
}
ElementCompute output = mul(converted_alpha, converted_acc);
@ -355,7 +356,8 @@ void gett_epilogue(
ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l));
// per-row beta
if (epilogue_params.Vbeta.data()) {
converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b));
converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l));
converted_beta = mul(converted_beta, converted_scale_c);
}
output = epilogue_fma(converted_beta, converted_src, output);
}
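// Net effect per element (explanatory note, not from the source): with the
// per-row scaling vectors now indexed as (m, n, l),
//   D(m,n,l) = alpha(m,n,l) * scale_a * scale_b * acc(m,n,l)
//            + beta(m,n,l)  * scale_c * C(m,n,l)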

View File

@ -159,6 +159,7 @@ struct RandomGaussianFunc {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -168,9 +169,10 @@ struct RandomGaussianFunc {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -191,7 +193,7 @@ struct RandomGaussianFunc {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -208,6 +210,16 @@ struct RandomGaussianFunc {
result = static_cast<Element>(0);
}
// Note that exclude_zero = true overrides the bernoulli_result above by replacing the zeros it produced
if (exclude_zero && result == Element(0)) {
if (rnd > 0) {
rnd += 1;
} else {
rnd -= 1;
}
result = Element(rnd);
}
return result;
}
};
@ -222,6 +234,7 @@ struct RandomGaussianFunc<complex<Element> > {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -231,9 +244,10 @@ struct RandomGaussianFunc<complex<Element> > {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -249,7 +263,7 @@ struct RandomGaussianFunc<complex<Element> > {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -270,6 +284,19 @@ struct RandomGaussianFunc<complex<Element> > {
reals[1] = from_real<Element>(0);
}
// Note that this overrides the else branch above by replacing the zero elements it produced
if (exclude_zero &&
reals[0] == from_real<Element>(0.0) &&
reals[1] == from_real<Element>(0.0)) {
if (rnd[0] > 0.0) {
rnd[0] += 1.0;
} else {
rnd[0] -= 1.0;
}
reals[0] = from_real<Element>(rnd[0]);
}
return complex<Element>(reals[0], reals[1]);
}
};
@ -284,6 +311,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -293,9 +321,10 @@ struct RandomGaussianFunc<Quaternion<Element> > {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -313,7 +342,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -343,6 +372,21 @@ struct RandomGaussianFunc<Quaternion<Element> > {
reals[3] = from_real<Element>(0);
}
// Note that exclude_zero overrides the else branch above by replacing zero elements (only the first component is nudged)
if (exclude_zero &&
reals[0] == from_real<Element>(0) &&
reals[1] == from_real<Element>(0) &&
reals[2] == from_real<Element>(0) &&
reals[3] == from_real<Element>(0)) {
if (rnd1[0] > 0.0) {
rnd1[0] += 1.0;
} else {
rnd1[0] -= 1.0;
}
reals[0] = from_real<Element>(rnd1[0]);
}
return Quaternion<Element>(reals[0], reals[1], reals[2], reals[3]);
}
};
@ -440,10 +484,11 @@ void TensorFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
/// data.
bool exclude_zero = false) { ///< Exclude zeros from tensor init.
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz, exclude_zero);
detail::TensorFillGaussianFunc<Element, Layout> func(
dst,
@ -466,8 +511,9 @@ void TensorFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
/// data.
bool exclude_zero = false) { ///< Exclude zeros from tensor init.
TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz, exclude_zero);
TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz, exclude_zero);
@ -485,7 +531,7 @@ void TensorFillSymmetricRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
@ -515,7 +561,7 @@ void BlockFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
/// data.
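
For reference, a hypothetical host-side call against the updated Gaussian overloads above; the HostTensor setup and parameter values are illustrative assumptions, and note that pnz now saturates at 1.0 rather than 100.0:

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void gaussian_fill_example() {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({128, 128});
  cutlass::reference::host::TensorFillRandomGaussian(
      A.host_view(),
      /*seed=*/2024,
      /*mean=*/0.0,
      /*stddev=*/1.0,
      /*bits=*/-1,
      /*pnz=*/0.5,             // ~half the elements are zeroed
      /*exclude_zero=*/false);
}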
@ -542,23 +588,47 @@ struct RandomUniformFunc {
double min;
int int_scale;
//
// Methods
//
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
bool exclude_zero;
RandomUniformFunc(
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
bool exclude_zero_ = false
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
, bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
, bernoulli_dist(pnan_)
, exclude_zero(exclude_zero_)
{
std::srand((unsigned)seed);
}
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero) {
min = (min == 0.0) ? min + 1: min;
range = (max == 0.0) ? range - 1: range;
}
}
/// Compute random value and update RNG state
Element operator()() const {
Element operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN);
}
}
double rnd = double(std::rand()) / double(RAND_MAX);
@ -575,6 +645,15 @@ struct RandomUniformFunc {
result = static_cast<Element>(Real(rnd));
}
if (exclude_zero && result == Element(0)) {
if (rnd > 0.0) {
rnd = std::min(min + range, rnd + 1.0);
} else {
rnd = std::max(min, rnd - 1.0);
}
result = static_cast<Element>(Real(rnd));
}
return result;
}
};
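
The NaN path above only fires for element types with a quiet NaN, so integer specializations compile the check away. A standalone sketch of the sampling scheme under those assumptions (hypothetical helper, not the CUTLASS functor):

#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

template <typename Element>
Element draw(std::mt19937 &rng, double pnan, double min, double max) {
  // Mirrors the has_quiet_NaN guard in RandomUniformFunc::operator().
  if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
    std::bernoulli_distribution is_nan(pnan);
    if (pnan > 0 && is_nan(rng)) {
      return Element(NAN);
    }
  }
  std::uniform_real_distribution<double> uni(min, max);
  return static_cast<Element>(uni(rng));
}

int main() {
  std::mt19937 rng(2024);
  int nan_count = 0;
  for (int i = 0; i < 1000; ++i) {
    if (std::isnan(draw<float>(rng, /*pnan=*/0.1, 0.0, 1.0))) ++nan_count;
  }
  std::printf("NaNs: %d / 1000 (expect ~100)\n", nan_count);
  return 0;
}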
@ -590,6 +669,15 @@ struct RandomUniformFunc<complex<Element> > {
double min;
int int_scale;
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
bool exclude_zero;
//
// Methods
//
@ -598,15 +686,33 @@ struct RandomUniformFunc<complex<Element> > {
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
bool exclude_zero_ = false
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
, bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
, bernoulli_dist(pnan_)
, exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero) {
min = (min == 0.0) ? min + 1: min;
range = (max == 0.0) ? range - 1: range;
}
}
/// Compute random value and update RNG state
complex<Element> operator()() const {
complex<Element> operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN); // converts implicitly to complex<Element>; imaginary part stays zero
}
}
Element reals[2];
@ -625,6 +731,19 @@ struct RandomUniformFunc<complex<Element> > {
else {
reals[i] = from_real<Element>(Real(rnd));
}
if (exclude_zero &&
i == 0 &&
reals[0] == from_real<Element>(0.0)) {
if (rnd > 0.0) {
rnd = std::min(min + range, rnd + 1.0);
} else {
rnd = std::max(min, rnd - 1.0);
}
reals[0] = from_real<Element>(Real(rnd));
}
}
return complex<Element>(reals[0], reals[1]);
@ -642,6 +761,13 @@ struct RandomUniformFunc<Quaternion<Element> > {
double min;
int int_scale;
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
//
// Methods
//
@ -650,15 +776,26 @@ struct RandomUniformFunc<Quaternion<Element> > {
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
std::srand((unsigned)seed);
}
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_),
bernoulli_rnd{static_cast<engine_type::result_type>(seed_)},
bernoulli_dist(pnan_)
{
std::srand((unsigned)seed);
}
/// Compute random value and update RNG state
Quaternion<Element> operator()() const {
Quaternion<Element> operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN); // scalar NAN converts implicitly; remaining quaternion components stay zero
}
}
Element reals[4];
@ -712,7 +849,7 @@ struct TensorFillRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
view.at(coord) = func();
}
@ -749,7 +886,7 @@ struct TensorFillSymmetricRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
// Fill half of matrix based on FillMode
if (Layout::kRank == 2 &&
fill_mode == cutlass::FillMode::kLower &&
@ -796,7 +933,7 @@ struct TensorFillPadDiagonalRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
// Fill half of matrix based on FillMode
if (Layout::kRank == 2 &&
(fill_mode == cutlass::FillMode::kLower) &&
@ -825,10 +962,12 @@ void TensorFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
/// data.
double pnan = 0, ///< Probability of NaN elements in [0, 1].
bool exclude_zero = false) { ///< Exclude zero from tensor init
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan, exclude_zero);
detail::TensorFillRandomUniformFunc<Element, Layout> func(
dst,
@ -850,12 +989,14 @@ void TensorFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Probability of NaN elements in [0, 1].
bool exclude_zero = false) { ///< Exclude zero from tensor init
TensorFillRandomUniform(dst.view_real(), seed, max, min, bits);
TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits);
TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero);
TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero);
}
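
A hypothetical call exercising both new knobs of the uniform overloads above (names and values are illustrative):

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void uniform_fill_example() {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> B({64, 64});
  cutlass::reference::host::TensorFillRandomUniform(
      B.host_view(),
      /*seed=*/2024,
      /*max=*/1.0,
      /*min=*/-1.0,
      /*bits=*/-1,
      /*pnan=*/0.05,           // ~5% of elements become NaN
      /*exclude_zero=*/true);  // remaining samples are nudged off zero
}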
@ -972,10 +1113,11 @@ void BlockFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
/// data.
double pnan = 0) { ///< Probability of NaN elements in [0, 1].
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan);
for (size_t i = 0; i < capacity; ++i) {
ReferenceFactory<Element>::get(ptr, i) = random_func();
@ -1259,7 +1401,11 @@ template <
void TensorFillRandom(
TensorView<Element, Layout> view, ///< destination tensor
uint64_t seed,
Distribution dist) {
Distribution dist,
bool exclude_zero = false ///< If true, excludes 0.
/// Note that setting this flag skews the distribution toward +/-1,
/// since zeros are replaced by simply adding or subtracting 1.
) {
using Real = typename RealType<Element>::Type;
@ -1269,14 +1415,18 @@ void TensorFillRandom(
seed,
dist.gaussian.mean,
dist.gaussian.stddev,
dist.int_scale);
dist.int_scale,
dist.gaussian.pnz,
exclude_zero);
} else if (dist.kind == Distribution::Uniform) {
TensorFillRandomUniform(
view,
seed,
dist.uniform.max,
dist.uniform.min,
dist.int_scale);
dist.int_scale,
dist.uniform.pnan,
exclude_zero);
}
}
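
Putting it together, a hypothetical Distribution-driven fill using the dispatch above; dist.uniform.pnan is assumed to be the member this commit adds alongside gaussian.pnz:

#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void distribution_fill_example() {
  cutlass::Distribution dist;
  dist.set_uniform(/*min=*/-2.0, /*max=*/2.0, /*int_scale=*/0);
  dist.uniform.pnan = 0.0;   // assumed new member: probability of NaNs

  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({256, 256});
  cutlass::reference::host::TensorFillRandom(
      A.host_view(), /*seed=*/2024, dist, /*exclude_zero=*/true);
}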
@ -1354,7 +1504,8 @@ void BlockFillRandom(
seed,
dist.uniform.max,
dist.uniform.min,
dist.int_scale);
dist.int_scale,
dist.uniform.pnan);
}
}