CUTLASS 3.6.0 (#1850)

* v3.6

* update changelog

* update readme

* fix typo

* fixing typos

* hopper gemm with weight prefetch

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
This commit is contained in:
Yujia Zhai
2024-10-09 12:33:27 -07:00
committed by GitHub
parent 0837a2a00a
commit cc3c29a81a
354 changed files with 105943 additions and 8203 deletions

View File

@ -221,7 +221,8 @@ cutlass_add_cutlass_library(
# files split for parallel compilation
src/reference/gemm_int4.cu
src/reference/gemm_int8_canonical.cu
src/reference/gemm_s8_s8_s32.cu
src/reference/gemm_u8_u8_s32.cu
src/reference/gemm_int8_interleaved_32.cu
src/reference/gemm_int8_interleaved_64.cu
src/reference/gemm_e4m3a_e4m3out.cu
@ -278,6 +279,7 @@ execute_process(
--generator-target library
--architectures "${CUTLASS_NVCC_ARCHS_ENABLED}"
--kernels "${CUTLASS_LIBRARY_KERNELS}"
--instantiation-level "${CUTLASS_LIBRARY_INSTANTIATION_LEVEL}"
--ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}"
--exclude-kernels "${CUTLASS_LIBRARY_EXCLUDE_KERNELS}"
--kernel-filter-file "${KERNEL_FILTER_FILE}"

View File

@ -113,6 +113,12 @@ template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
static int const kMax = 90;
};
// Arch conditional sparse WGMMA
template <> struct ArchMap<arch::Sm90, arch::OpClassSparseTensorOp> {
static int const kMin = 90;
static int const kMax = 90;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library

View File

@ -103,6 +103,17 @@ public:
void *device_workspace = nullptr,
cudaStream_t stream = nullptr) const = 0;
// Originally designed for metadata, but should be useful for FP8/6/4 too.
virtual Status initialize_with_profiler_workspace(
void const *configuration,
void *host_workspace,
void *device_workspace,
uint8_t **profiler_workspace_ptrs,
int problem_count,
cudaStream_t stream = nullptr) {
return Status::kErrorNotSupported;
}
virtual Status run(
void const *arguments,
void *host_workspace,
@ -290,7 +301,6 @@ struct GemmUniversalArguments {
// Needed for some 3.x kernels
int sm_count{0};
library::RasterOrder raster_order{};
int swizzle_size{1};
};
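
A caller-side sketch of how the new hook composes with the existing interface (the surrounding variables `op`, `configuration`, `profiler_workspace_ptrs`, `problem_count` are hypothetical): operations that do not override the virtual report Status::kErrorNotSupported, so a profiler can fall back to the regular initialize() path.

Status init_status = op->initialize_with_profiler_workspace(
    &configuration, host_workspace, device_workspace,
    profiler_workspace_ptrs, problem_count, stream);
if (init_status == Status::kErrorNotSupported) {
  // Operation has no profiler-workspace specialization; use the default path.
  init_status = op->initialize(&configuration, host_workspace, device_workspace, stream);
}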

View File

@ -616,7 +616,7 @@ private:
/* traversal_stride = */ {traversal_stride_h, traversal_stride_w},
/* dilation = */ {dilation_h, dilation_w},
num_groups);
out_args.mainloop.problem_shape = problem_shape;
out_args.problem_shape = problem_shape;
// ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
@ -788,7 +788,7 @@ private:
/* traversal_stride = */ {traversal_stride_d, traversal_stride_h, traversal_stride_w},
/* dilation = */ {dilation_d, dilation_h, dilation_w},
num_groups);
out_args.mainloop.problem_shape = problem_shape;
out_args.problem_shape = problem_shape;
// ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)

View File

@ -249,7 +249,6 @@ protected:
/* Query device SM count to pass onto the kernel as an argument, where needed */
operator_args.hw_info.sm_count = arguments->sm_count;
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
}
@ -282,17 +281,18 @@ public:
static_cast<GemmUniversalArguments const *>(arguments_ptr);
OperatorArguments args;
auto status = update_arguments_(args, arguments);
if (status != Status::kSuccess) {
return status;
}
// can_implement rules may need access to problem shape
args.problem_shape = cute::make_shape(
configuration->problem_size.m(),
configuration->problem_size.n(),
configuration->problem_size.k(),
configuration->batch_count);
auto status = update_arguments_(args, arguments);
if (status != Status::kSuccess) {
return status;
}
return Operator::can_implement(args);
}

View File

@ -121,14 +121,14 @@ void initialize_gemm_reference_operations_fp_mixed_input(Manifest &manifest) {
half_t,
int8_t,
half_t,
float
>(manifest);
make_gemm_real_canonical_layouts<
half_t,
uint8_t,
half_t,
float
>(manifest);
// bfloat16_t mixed with 8-bit integer input

View File

@ -54,6 +54,14 @@ void initialize_gemm_reference_operations_fp_other(Manifest &manifest) {
half_t
>(manifest);
make_gemm_real_canonical_layouts<
half_t,
half_t,
float,
half_t,
half_t
>(manifest);
make_gemm_real_canonical_layouts<
double,
double,

View File

@ -73,7 +73,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int4b_t,
int8_t,
@ -110,7 +110,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int8_t,
int4b_t,

View File

@ -0,0 +1,146 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Instantiates GEMM reference implementations.
*/
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
#include "gemm_reference_operation.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
// A/B: s8
// Acc : s32
// C/D: some variance
// Epi Scalar: some variance
// 1. s8_s8_s32_s32_s32 (s32 epi scalar)
// 2. s8_s8_s32_s32_s32 (f32 epi scalar)
// 3. s8_s8_s32_s8_s8 (f32 epi scalar)
// 4. s8_s8_s32_s8_s8 (s32 epi scalar)
// 5. s8_s8_s32_s32_s8 (f32 epi scalar)
// 6. s8_s8_s32_f32_f32
// 7. s8_s8_s32_f16_f16 (f32 epi scalar)
// D = convert( Scalar(alpha) * Scalar( A * B ) + Scalar(beta) * Scalar( C ) )
// Convert: from epi Scalar dtype to D dtype
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest) {
// 1.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t // ElementD
>(manifest);
// 2.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t, // ElementD
NumericConverterClamp<int32_t, float> // From Scalar to D
>(manifest);
// 3.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int8_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
// 4.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int8_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, int32_t> // From Scalar to D
>(manifest);
// 5.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
// 6.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
float, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
float // ElementD
>(manifest);
// 7.
make_gemm_real_canonical_layouts<
int8_t, // ElementA
int8_t, // ElementB
half_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
half_t, // ElementD
NumericConverterClamp<half_t, float> // From Scalar to D
>(manifest);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
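
As an illustration of the epilogue formula above, variant 3 (f32 epilogue scalar, s32 accumulator, s8 output) evaluates, per element, something like the following sketch; the variable names and values are hypothetical, not part of this commit.

float   alpha = 1.0f, beta = 1.0f;                            // f32 epi scalars
int32_t accum = -200;                                         // one A * B accumulator entry
int8_t  c_in  = 100;                                          // matching C entry
float   epi   = alpha * float(accum) + beta * float(c_in);    // -100.0f
int8_t  d_out = NumericConverterClamp<int8_t, float>{}(epi);  // clamped convert to D: -100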

View File

@ -45,72 +45,48 @@ namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
void initialize_gemm_reference_operations_int8_canonical(Manifest &manifest) {
// A/B: u8
// Acc : s32
// C/D: some variance
// 1. u8_u8_s32_s32_s32 (s32 epi scalar)
// 2. u8_u8_s32_s32_s32 (f32 epi scalar)
// 3. u8_u8_s32_s8_s8 (f32 epi scalar)
void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest) {
// 1.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int32_t,
int32_t,
int32_t
uint8_t, // ElementA
uint8_t, // ElementB
int32_t, // ElementC
int32_t, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t // ElementD
>(manifest);
// 2.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int8_t,
float,
int32_t,
int8_t,
NumericConverterClamp<int8_t, float>
uint8_t, // ElementA
uint8_t, // ElementB
int32_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int32_t, // ElementD
NumericConverterClamp<int32_t, float> // From Scalar to D
>(manifest);
// 3.
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int32_t,
float,
int32_t,
int32_t,
NumericConverterClamp<int32_t, float>
uint8_t, // ElementA
uint8_t, // ElementB
int8_t, // ElementC
float, // ElementScalar / ElementCompute
int32_t, // ElementAccumulator
int8_t, // ElementD
NumericConverterClamp<int8_t, float> // From Scalar to D
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int32_t,
int32_t,
int32_t
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int8_t,
float,
int32_t,
int8_t,
NumericConverterClamp<int8_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
uint8_t,
uint8_t,
int32_t,
float,
int32_t,
int32_t,
NumericConverterClamp<int32_t, float>
>(manifest);
make_gemm_real_canonical_layouts<
int8_t,
int8_t,
int8_t,
int32_t,
int32_t,
int8_t,
NumericConverterClamp<int8_t, int32_t>
>(manifest);
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -46,7 +46,8 @@ namespace library {
void initialize_gemm_reference_operations_int4(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_32(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_64(Manifest &manifest);
void initialize_gemm_reference_operations_int8_canonical(Manifest &manifest);
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e5m2a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e5m2out(Manifest &manifest);
@ -72,7 +73,8 @@ void initialize_reference_operations(Manifest &manifest) {
initialize_gemm_reference_operations_int8_interleaved_32(manifest);
initialize_gemm_reference_operations_int8_interleaved_64(manifest);
initialize_gemm_reference_operations_int8_canonical(manifest);
initialize_gemm_reference_operations_s8_s8_s32(manifest);
initialize_gemm_reference_operations_u8_u8_s32(manifest);
initialize_gemm_reference_operations_e4m3a_e4m3out(manifest);
initialize_gemm_reference_operations_e5m2a_e4m3out(manifest);
@ -85,7 +87,6 @@ void initialize_reference_operations(Manifest &manifest) {
initialize_gemm_reference_operations_fp32out(manifest);
initialize_gemm_reference_operations_fp_other(manifest);
initialize_gemm_reference_operations_fp_mixed_input(manifest);
initialize_gemm_reference_operations_int_mixed_input(manifest);
}

View File

@ -0,0 +1,445 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Defines operations for all GEMM operation kinds in CUTLASS Library.
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" // StructuredSparseCompressor
#include "cutlass/transform/device/transform_universal_adapter.hpp" // TransformUniversalAdapter
#include "cutlass/util/packed_stride.hpp" // make_cute_packed_stride
#include "gemm_operation_3x.hpp"
#include "library_internal.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
#define CUDA_CHECK(cuda_error) \
{ \
if (cuda_error != cudaSuccess) { \
printf("cudaError %s in %s:%d\n", cudaGetErrorString(cuda_error), __func__, __LINE__ ); \
return Status::kInvalid; \
} \
}
namespace cutlass::library {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Limitation & Assumptions:
// 1. The tensor must be densely packed. That is, lda is k if the tensor is k-major,
// and lda is m if the tensor is m-major.
// 2. The circular buffers for tensorA and tensorE may hold fewer entries than those for tensorB
//    and the other tensors, because problem_count is not available in get_device_workspace_size().
//    When the circular buffer is enabled, at least 192MB of memory is guaranteed to be used.
template <typename Operator_>
class SparseGemmUniversal3xOperation : public GemmOperation3xBase<Operator_> {
public:
using Operator = Operator_;
using OperatorArguments = typename Operator::Arguments;
using ElementA = typename Operator::ElementA;
using LayoutA = typename Operator::LayoutA;
using ElementB = typename Operator::ElementB;
using LayoutB = typename Operator::LayoutB;
using ElementC = typename Operator::ElementC;
using LayoutC = typename Operator::LayoutC;
using ElementD = typename Operator::ElementD;
using LayoutD = typename Operator::LayoutD;
using ElementAccumulator = typename Operator::ElementAccumulator;
using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;
using CollectiveMainloop = typename Operator::CollectiveMainloop;
using CollectiveEpilogue = typename Operator::CollectiveEpilogue;
using ThreadEpilogueOp = typename CollectiveEpilogue::ThreadEpilogueOp;
using ElementE = typename CollectiveMainloop::ElementE;
using LayoutE = typename CollectiveMainloop::LayoutE;
using SparseConfig = typename CollectiveMainloop::SparseConfig;
using LayoutATag = decltype(SparseConfig::deduce_layoutA_tag(typename CollectiveMainloop::LayoutA{}));
using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility<
cute::Shape<int, int, int, int>,
ElementA,
LayoutATag,
SparseConfig>;
using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor<
cute::Shape<int, int, int, int>,
ElementA,
LayoutATag,
SparseConfig,
typename Operator::ArchTag>;
using Compressor = cutlass::transform::device::TransformUniversalAdapter<CompressorKernel>;
public:
/// Constructor
SparseGemmUniversal3xOperation(char const *name = "unknown_gemm"):
GemmOperation3xBase<Operator_>(name, GemmKind::kUniversal) {}
protected:
/// Constructs the arguments structure given the configuration and arguments
static Status construct_arguments_(
OperatorArguments &operator_args, GemmUniversalConfiguration const *configuration) {
// NOTE: GemmUniversalConfiguration does not contain problem shapes or batch strides
// Do nothing here and construct kernel arguments in update_arguments_ instead
// We also cannot construct TMA descriptors without all the arguments available
operator_args.mode = configuration->mode;
return Status::kSuccess;
}
template<class FusionArgs, class = void>
struct UpdateFusionArgs {
static Status update_(FusionArgs const& fusion_args, GemmUniversalArguments const &arguments) {
// If a custom EVT is instantiated then it is the user's responsibility
// to ensure alpha and beta are updated appropriately
return Status::kSuccess;
}
};
template<class FusionArgs>
struct UpdateFusionArgs<FusionArgs, cute::void_t<decltype(FusionArgs{}.alpha)>> {
static Status update_(FusionArgs& fusion_args, GemmUniversalArguments const &arguments) {
if (arguments.pointer_mode == ScalarPointerMode::kHost) {
fusion_args.alpha = *static_cast<ElementCompute const *>(arguments.alpha);
fusion_args.beta = *static_cast<ElementCompute const *>(arguments.beta);
fusion_args.alpha_ptr = nullptr;
fusion_args.beta_ptr = nullptr;
return Status::kSuccess;
}
else if (arguments.pointer_mode == ScalarPointerMode::kDevice) {
fusion_args.alpha = 0;
fusion_args.beta = 0;
fusion_args.alpha_ptr = static_cast<ElementCompute const *>(arguments.alpha);
fusion_args.beta_ptr = static_cast<ElementCompute const *>(arguments.beta);
return Status::kSuccess;
}
else {
return Status::kErrorInvalidProblem;
}
}
};
/// Constructs the arguments structure given the configuration and arguments
static Status update_arguments_(
OperatorArguments &operator_args,
GemmUniversalArguments const *arguments,
CompressorUtility const& compressor_utility,
void* device_a_compressed_ptr = nullptr,
void* device_e_ptr = nullptr) {
Status status = Status::kSuccess;
status = UpdateFusionArgs<decltype(operator_args.epilogue.thread)>::update_(
operator_args.epilogue.thread, *arguments);
if (status != Status::kSuccess) {
return status;
}
// TODO: type erase Arguments structure in 3.0 GEMM
operator_args.problem_shape = cute::make_shape(
arguments->problem_size.m(),
arguments->problem_size.n(),
arguments->problem_size.k(),
arguments->batch_count);
// update arguments
operator_args.mainloop.ptr_A = reinterpret_cast<ElementA const *>(device_a_compressed_ptr);
operator_args.mainloop.ptr_B = static_cast<ElementB const *>(arguments->B);
operator_args.mainloop.ptr_E = reinterpret_cast<ElementE const *>(device_e_ptr);
operator_args.epilogue.ptr_C = static_cast<ElementC const *>(arguments->C);
operator_args.epilogue.ptr_D = static_cast<ElementD *>(arguments->D);
operator_args.mainloop.layout_a = compressor_utility.fill_layoutA_from_compressor();
operator_args.mainloop.layout_e = compressor_utility.fill_layoutE_from_compressor();
operator_args.mainloop.dB = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideB>(
arguments->ldb, arguments->batch_stride_B);
operator_args.epilogue.dC = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideC>(
arguments->ldc, arguments->batch_stride_C);
operator_args.epilogue.dD = operator_args.epilogue.dC;
/* Query device SM count to pass onto the kernel as an argument, where needed */
operator_args.hw_info.sm_count = arguments->sm_count;
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
}
if constexpr (!std::is_const_v<decltype(operator_args.scheduler.raster_order)>) {
using Enum_t = decltype(operator_args.scheduler.raster_order);
switch (arguments->raster_order) {
case RasterOrder::kAlongN:
operator_args.scheduler.raster_order = Enum_t::AlongN;
break;
case RasterOrder::kAlongM:
operator_args.scheduler.raster_order = Enum_t::AlongM;
break;
default:
operator_args.scheduler.raster_order = Enum_t::Heuristic;
}
}
return status;
}
public:
/// Returns success if the operation can proceed
Status can_implement(
void const *configuration_ptr, void const *arguments_ptr) const override {
GemmUniversalConfiguration const *configuration =
static_cast<GemmUniversalConfiguration const *>(configuration_ptr);
GemmUniversalArguments const *arguments =
static_cast<GemmUniversalArguments const *>(arguments_ptr);
OperatorArguments args;
auto problem_shape_MNKL = cute::make_shape(
configuration->problem_size.m(),
configuration->problem_size.n(),
configuration->problem_size.k(),
configuration->batch_count);
const int M = configuration->problem_size.m();
const int N = configuration->problem_size.n();
const int K = configuration->problem_size.k();
const int L = configuration->batch_count;
using StrideA = typename CompressorUtility::StrideA;
auto dA = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
compressor_utility.set_problem_size(problem_shape_MNKL, dA);
auto status = update_arguments_(args, arguments, compressor_utility);
if (status != Status::kSuccess) {
return status;
}
// can_implement rules may need access to problem shape
args.problem_shape = problem_shape_MNKL;
return Operator::can_implement(args);
}
/// Gets the host-side workspace
uint64_t get_host_workspace_size(void const *) const override {
// Memory to hold operator
host_op_workspace_size = sizeof(Operator);
// Memory to hold result of `.structure_sparse_zero_mask_fill()`
tensor_a_size = compressor_utility.get_raw_tensor_A_bytes();
// NOTE: order here is the order of workspace partition
const uint64_t size = host_op_workspace_size + tensor_a_size;
return size;
}
/// Gets the device-side workspace
uint64_t get_device_workspace_size(
void const *configuration_ptr, void const *arguments_ptr) const override {
OperatorArguments args;
auto status = update_arguments_(
args, static_cast<GemmUniversalArguments const *>(arguments_ptr), compressor_utility);
if (status != Status::kSuccess) {
return 0;
}
typename Compressor::Arguments compress_arguments {
{compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
{/*Empty Not Use*/},
{/*Empty Not Use*/} };
// Size for one iteration
// For multi-iteration, will need to multiply result of this function w/ actual problem_count
tensor_ac_size = compressor_utility.get_compressed_tensor_A_bytes();
tensor_e_size = compressor_utility.get_tensor_E_bytes();
device_op_workspace_size = Operator::get_workspace_size(args);
device_compress_workspace_size = Compressor::get_workspace_size(compress_arguments);
// NOTE: order here is the order of workspace partition
device_per_iter_workspace_size = device_op_workspace_size + device_compress_workspace_size + tensor_ac_size + tensor_e_size;
return device_per_iter_workspace_size;
}
/// Initializes the workspace
Status initialize(
void const *configuration_ptr,
void *host_workspace,
void *device_workspace,
cudaStream_t stream = nullptr) const override {
return Status::kErrorInternal;
}
Status initialize_with_profiler_workspace(
void const *configuration,
void *host_workspace,
void *device_workspace,
uint8_t **profiler_workspaces,
int problem_count_from_profiler,
cudaStream_t stream = nullptr) {
// Set problem_count.
problem_count = problem_count_from_profiler;
// * Host Ptr
auto* host_op_workspace_ptr = reinterpret_cast<uint8_t*>(host_workspace);
auto* host_a_raw_ptr = host_op_workspace_ptr + host_op_workspace_size;
// * Construct Op
Operator *op = new (host_op_workspace_ptr) Operator;
// * Device Full Ptr
device_full_ptr = reinterpret_cast<uint8_t*>(device_workspace);
// * Device Ptr (1st iteration)
// Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
// iteri : op_workspace | tensor_ac | tensor_e
auto* device_ptr_iter1 = device_full_ptr;
auto* device_op_workspace_ptr_iter1 = device_ptr_iter1;
auto* device_compressor_workspace_ptr_iter1 = device_op_workspace_ptr_iter1 + device_op_workspace_size;
auto* device_a_compressed_ptr_iter1 = device_compressor_workspace_ptr_iter1 + device_compress_workspace_size;
auto* device_e_ptr_iter1 = device_a_compressed_ptr_iter1 + tensor_ac_size;
// * Device A Raw Ptr
auto* device_a_raw_ptr = profiler_workspaces[0];
// * Random fill 50% of TensorA w/ zero following the structured sparse requirement
cudaMemcpy(host_a_raw_ptr, device_a_raw_ptr, tensor_a_size, cudaMemcpyDeviceToHost);
compressor_utility.structure_sparse_zero_mask_fill(host_a_raw_ptr, 2000);
cudaMemcpy(device_a_raw_ptr, host_a_raw_ptr, tensor_a_size, cudaMemcpyHostToDevice);
CUDA_CHECK(cudaGetLastError());
// * Compress DTensorA and get DTensorAC & DTensorE
cutlass::KernelHardwareInfo hw_info;
hw_info.device_id = 0;
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
typename Compressor::Arguments arguments{
{compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
{device_a_raw_ptr,
compressor_utility.dA,
device_a_compressed_ptr_iter1,
device_e_ptr_iter1},
{hw_info}
};
cutlass::Status status {cutlass::Status::kSuccess };
Compressor compressor_op;
status = compressor_op.can_implement(arguments);
if (status != Status::kSuccess) {
return status;
}
status = compressor_op.initialize(arguments, device_compressor_workspace_ptr_iter1, stream);
if (status != Status::kSuccess) {
return status;
}
status = compressor_op.run(stream);
if (status != Status::kSuccess) {
return status;
}
CUDA_CHECK(cudaStreamSynchronize(stream));
// * Copy Iter1's DTensorAC DTensorE to each iteration's DTensorAC DTensorE
for (int iter_i = 1; iter_i < problem_count; iter_i++) {
// * Device AC E Ptr per iteration
// Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
// iteri : op_workspace | tensor_ac | tensor_e
auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_i;
auto* device_op_workspace_ptr = device_ptr_iteri;
auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;
cudaMemcpy(device_a_compressed_ptr, device_a_compressed_ptr_iter1, tensor_ac_size, cudaMemcpyDeviceToDevice);
cudaMemcpy(device_e_ptr, device_e_ptr_iter1, tensor_e_size, cudaMemcpyDeviceToDevice);
}
CUDA_CHECK(cudaGetLastError());
return Status::kSuccess;
}
/// Runs the kernel
Status run(
void const *arguments_ptr,
void *host_workspace,
void *device_workspace = nullptr,
cudaStream_t stream = nullptr) const override {
OperatorArguments operator_args;
auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_idx;
auto* device_op_workspace_ptr = device_ptr_iteri;
auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;
iter_idx = (iter_idx + 1) % problem_count;
Status status = update_arguments_(operator_args, static_cast<GemmUniversalArguments const *>(arguments_ptr), compressor_utility, device_a_compressed_ptr, device_e_ptr );
if (status != Status::kSuccess) {
return status;
}
Operator *op = static_cast<Operator *>(host_workspace);
// We need to call initialize() since we have to rebuild TMA desc for every new set of args
status = op->run(operator_args, device_op_workspace_ptr, stream);
return status;
}
private:
// Mutable members: state that must be updated from within the const member functions.
mutable CompressorUtility compressor_utility;
mutable int problem_count = 1;
mutable int iter_idx = 0;
uint8_t* device_full_ptr = nullptr;
mutable uint64_t tensor_ac_size = 0;
mutable uint64_t tensor_e_size = 0;
mutable uint64_t tensor_a_size = 0;
mutable uint64_t host_op_workspace_size = 0;
mutable uint64_t device_compress_workspace_size = 0;
mutable uint64_t device_op_workspace_size = 0;
mutable uint64_t device_per_iter_workspace_size = 0;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass::library
///////////////////////////////////////////////////////////////////////////////////////////////////
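
Per the comments in get_device_workspace_size() above, the returned size covers a single iteration; a sketch of the caller-side arithmetic (hypothetical variable names) for provisioning the full circular buffer:

uint64_t per_iter_bytes = op->get_device_workspace_size(&configuration, &arguments);
uint64_t total_bytes    = per_iter_bytes * uint64_t(problem_count);  // one slot per iteration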

View File

@ -756,6 +756,7 @@ OpcodeClassID_enumerants[] = {
{"tensorop", "<tensorop>", OpcodeClassID::kTensorOp},
{"wmmatensorop", "<wmmatensorop>", OpcodeClassID::kWmmaTensorOp},
{"wmma", "<wmma>", OpcodeClassID::kWmmaTensorOp},
{"sptensorop", "<sptensorop>", OpcodeClassID::kSparseTensorOp}
};
/// Converts a OpcodeClassID enumerant to a string

View File

@ -36,6 +36,7 @@
#if CUTLASS_ENABLE_CUBLAS
#include <cublas_v2.h>
#include <cublasLt.h>
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
@ -90,25 +91,48 @@ Status cublas_satisfies(library::SymmDescription const &desc);
/// Additionally, it provides implicit cast from CublasCreate's object to cublasHandle_t's object
class CublasCreate {
private:
cublasHandle_t handle;
cublasStatus_t status;
public:
CublasCreate() {
status = cublasCreate(&handle);
}
~CublasCreate() {
cublasDestroy(handle);
}
/// Implicit cast CublasCreate object to cublasHandle_t
operator cublasHandle_t() const { return handle; }
/// returns cublasStatus_t for handle creation
cublasStatus_t get_cublas_create_status() { return status; }
};
/// This is a helper class to create cublasLtHandle_t automatically on CublasLtCreate object creation and
/// to destroy cublasLtHandle_t on CublasLtCreate object destruction.
/// Additionally, it provides implicit cast from CublasLtCreate's object to cublasLtHandle_t's object
class CublasLtCreate {
private:
cublasLtHandle_t handle;
cublasStatus_t status;
public:
CublasLtCreate() {
status = cublasLtCreate(&handle);
}
~CublasLtCreate() {
cublasLtDestroy(handle);
}
/// Implicit cast CublasLtCreate object to cublasLtHandle_t
operator cublasLtHandle_t() const { return handle; }
/// returns cublasStatus_t for handle creation
cublasStatus_t get_cublaslt_create_status() { return status; }
};
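
A usage sketch for the RAII wrappers above (illustrative, not part of this commit): the handle is created in the constructor, converts implicitly wherever a raw handle is expected, and is destroyed at scope exit.

{
  CublasLtCreate lt;                                    // cublasLtCreate() runs in the constructor
  if (lt.get_cublaslt_create_status() == CUBLAS_STATUS_SUCCESS) {
    cublasLtHandle_t raw_handle = lt;                   // implicit conversion
    // ... pass `lt` (or raw_handle) to cublasLt API calls ...
  }
}                                                       // cublasLtDestroy() runs in the destructor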
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace detail {
@ -226,6 +250,80 @@ struct cublasGemmExDispatcher {
cublasStatus_t operator()(cublasHandle_t handle);
};
/// Dispatcher to cublaslt kernels
//
struct cublasLtGemmExDispatcher {
//
// Data members
//
library::GemmDescription const &op_desc;
library::GemmUniversalConfiguration configuration;
library::GemmUniversalArguments arguments;
// cublas-specific data structures to fill cublas API call arguments
cublasOperation_t trans_A;
cublasOperation_t trans_B;
cudaDataType_t data_type_A;
cudaDataType_t data_type_B;
cudaDataType_t data_type_C;
cudaDataType_t compute_data_type = CUDA_R_32F;
//cublasLt-specific data structures
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
cublasLtMatmulPreference_t preference = NULL;
//is set by call to get_cublaslt_algo()
cublasLtMatmulHeuristicResult_t heuristicResult_;
void *workspace = nullptr;
Status status;
#if (__CUDACC_VER_MAJOR__ >= 11)
cublasComputeType_t compute_type;
#endif
//
// Methods
//
cublasLtGemmExDispatcher(
library::GemmDescription const &op_desc,
library::GemmUniversalConfiguration configuration_,
library::GemmUniversalArguments arguments_
);
/// Initialize the cublasLt variables
void initialize_cublaslt();
/// Runs auto-tuning for the cublas heuristics
bool get_cublaslt_algo(cublasLtHandle_t handle,
AlgorithmMode algorithm_mode
);
/// Executes GEMM using these arguments
cublasStatus_t operator()(cublasLtHandle_t handle);
~cublasLtGemmExDispatcher(){
// descriptors are no longer needed as all GPU work was already enqueued
if (preference) cublasLtMatmulPreferenceDestroy(preference);
if (Ddesc) cublasLtMatrixLayoutDestroy(Ddesc);
if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
if (operationDesc) cublasLtMatmulDescDestroy(operationDesc);
if (workspace) {
cudaFree(workspace);
}
}
};
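
A hypothetical call sequence for the dispatcher above (error handling elided; the AlgorithmMode value and `lt_handle` are illustrative assumptions):

cublasLtGemmExDispatcher dispatcher(op_desc, configuration, arguments);
dispatcher.initialize_cublaslt();                       // build matmul/layout descriptors
if (dispatcher.get_cublaslt_algo(lt_handle, AlgorithmMode::kDefault)) {
  cublasStatus_t result = dispatcher(lt_handle);        // enqueue the GEMM
}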
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Dispatcher to cublas rank k update kernels

View File

@ -48,7 +48,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// CUTLASS Profiler application
class CutlassProfiler {
private:
@ -66,13 +66,10 @@ private:
/// Prints usage
void print_usage_(std::ostream &);
/// Prints options
void print_options_(std::ostream &);
/// Initializes the device
void initialize_device_();
/// Enumerates all operations
void enumerate_();

View File

@ -81,6 +81,9 @@ private:
/// Buffer holding TensorRef instance to recently allocated memory
std::vector<uint8_t> tensor_ref_buffer_;
/// The device ID where the allocation is made
int device_;
public:
//
// Static member functions
@ -91,7 +94,7 @@ public:
/// Returns the stride of a packed layout
static std::vector<int64_t> get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent);
/// returns the capacity needed
@ -103,16 +106,16 @@ public:
/// Returns true if two blocks have exactly the same value
static bool block_compare_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity);
/// Returns true if two blocks have approximately the same value
static bool block_compare_relatively_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity,
double epsilon,
double nonzero_floor);
@ -123,15 +126,19 @@ public:
//
DeviceAllocation();
DeviceAllocation(library::NumericTypeID type, size_t capacity);
DeviceAllocation(
library::NumericTypeID type,
size_t capacity,
int device = -1);
DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
int batch_count = 1,
int device = -1);
~DeviceAllocation();
@ -142,9 +149,9 @@ public:
/// Allocates memory for a given layout and tensor
DeviceAllocation &reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
@ -157,7 +164,7 @@ public:
/// Data type of contained elements
library::NumericTypeID type() const;
/// Pointer to start of device memory allocation
void *data() const;
@ -184,7 +191,7 @@ public:
/// Capacity of allocation in number of elements
size_t capacity() const;
/// Capacity of allocation in bytes
size_t bytes() const;
@ -205,7 +212,7 @@ public:
/// Initializes a host allocation to a random distribution using std::rand()
void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);
/// Uniformly fills a tensor with a value when provided, otherwise with zero
void fill_device(double value);
@ -221,8 +228,12 @@ public:
/// Copies from an equivalent-sized tensor in device memory
void copy_to_host(void *ptr);
/// Writes a tensor to csv
void write_tensor_csv(std::ostream &out);
private:
/// A wrapper that sets the target device, performs the allocation, and restores the previous device
cudaError_t malloc(void** ptr, size_t size);
};
using DeviceAllocationList = std::list<DeviceAllocation>;
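
A minimal sketch of what the malloc wrapper above might look like (assumed implementation; this commit only shows the declaration):

cudaError_t DeviceAllocation::malloc(void** ptr, size_t size) {
  int previous_device = 0;
  cudaGetDevice(&previous_device);           // remember the caller's device
  cudaSetDevice(device_);                    // switch to this allocation's device
  cudaError_t result = cudaMalloc(ptr, size);
  cudaSetDevice(previous_device);            // restore the caller's device
  return result;
}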

View File

@ -29,7 +29,7 @@
*
**************************************************************************************************/
/* \file
\brief
*/
#pragma once
@ -68,46 +68,52 @@ private:
/// Non-owning set of named allocations
AllocationMap allocations_;
public:
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_block(
Options const &options,
std::string const &name,
library::NumericTypeID type,
size_t capacity);
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_tensor(
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride = std::vector<int64_t>(),
int batch_count = 1);
library::NumericTypeID type,
size_t capacity,
size_t device_index);
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift = 0);
size_t device_index);
/// Allocates memory for sparse meta data
DeviceAllocation *allocate_sparsemeta_tensor(
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *allocate_and_initialize_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index);
/// Allocates memory for sparse meta data
DeviceAllocation *allocate_and_initialize_sparsemeta_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift = 0);
int seed_shift,
size_t device_index);
/// Clears named allocations (but does not necessarily free memory)
void clear();
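
A hypothetical call against the revised interface above (argument values illustrative):

DeviceAllocation *A = device_context.allocate_and_initialize_tensor(
    options,
    "A",
    library::NumericTypeID::kF16,
    library::LayoutTypeID::kColumnMajor,
    {1024, 512},             // extent
    {},                      // stride: empty selects the packed default
    /* batch_count  = */ 1,
    /* seed_shift   = */ 0,
    /* device_index = */ 0);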

View File

@ -82,12 +82,16 @@ public:
struct Device {
/// Device ID
int device;
std::vector<int> devices;
/// Number of total devices
/// This is not set by the user; it is set automatically
int num_devices;
/// CUDA Device properties
cudaDeviceProp properties;
std::vector<cudaDeviceProp> properties;
/// Total memory allocation on device
/// Total memory allocation on each device
size_t maximum_capacity;
//
@ -100,8 +104,11 @@ public:
void print_options(std::ostream &out, int indent = 0) const;
void print_device_info(std::ostream &out) const;
/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
int compute_capability() const;
/// Returns the device ID from a device index
int device_id(size_t device_index) const;
/// Returns the compute capability of the listed devices (e.g. 61, 60, 70, 75)
int compute_capability(int device_index) const;
};
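
The new accessor's assumed semantics, as a sketch (the definition is not shown in this header diff):

int Options::Device::device_id(size_t device_index) const {
  return devices.at(device_index);  // map a profiler device index to a CUDA device ID
}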
/// Options related to initializing input tensors
@ -129,7 +136,7 @@ public:
//
explicit Initialization(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
@ -171,13 +178,13 @@ public:
//
explicit Verification(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
/// Returns true if a provider is enabled
bool provider_enabled(library::Provider provider) const;
/// Returns the index of a provider if it is enabled
size_t index(library::Provider provider) const;
};
@ -225,7 +232,7 @@ public:
/// Returns the index of a provider if it is enabled
size_t index(library::Provider provider) const;
};
/// Options related to reporting
struct Report {
@ -260,7 +267,7 @@ public:
//
explicit Report(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
};
@ -282,7 +289,7 @@ public:
//
explicit About(CommandLine const &cmdline);
void print_usage(std::ostream &out) const;
void print_options(std::ostream &out, int indent = 0) const;
@ -303,7 +310,7 @@ public:
/// Vector of operation name substrings
std::vector<std::string> operation_names;
/// Vector of operation name substrings
std::vector<std::string> excluded_operation_names;

View File

@ -51,10 +51,10 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Conv2dOperationProfiler::Conv2dOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kConv2d,
{
{ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
{ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv2d problem space"},
@ -165,13 +165,13 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(
int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;
// Adjust mainloop flop for dgrad strided
if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
flops_mainloop_ = flops_mainloop_ / (stride_h * stride_w);
}
int64_t flops_total_ = flops_mainloop_ + flops_epilogue_;
//complex-valued support
switch (operation_desc.tile_description.math_instruction.math_operation) {
case library::MathOperationID::kMultiplyAddComplex:
@ -188,14 +188,14 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(
/// Extracts the problem dimensions
Status Conv2dOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(operation->description());
if (!arg_as_int(problem_.n, "n", problem_space, problem)) {
@ -207,7 +207,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.h = 16;
}
if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
// default value
problem_.w = 16;
@ -227,7 +227,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.r = 3;
}
if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
// default value
problem_.s = 3;
@ -280,14 +280,14 @@ Status Conv2dOperationProfiler::initialize_configuration(
// cutlass profiler sets p and q which are cuDNN compliant. //
// //
////////////////////////////////////////////////////////////////////////////////////////
// set convolution output p
if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.p = (
problem_.h +
2 * problem_.pad_h -
((problem_.r - 1) * problem_.dilation_h + 1)
) / (problem_.stride_h)
+ 1;
}
@ -295,10 +295,10 @@ Status Conv2dOperationProfiler::initialize_configuration(
if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
// default value (set using cudnn formula for output width, when q is not provided)
problem_.q = (
problem_.w +
2 * problem_.pad_w -
((problem_.s - 1) * problem_.dilation_w + 1)
) / (problem_.stride_w)
+ 1;
}
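
Worked example of the cuDNN output-size formula above, using this profiler's defaults (h = w = 16, r = s = 3) with illustrative pad = 1, stride = 1, dilation = 1:

//   p = (h + 2*pad_h - ((r - 1)*dilation_h + 1)) / stride_h + 1
//     = (16 + 2 - 3) / 1 + 1 = 16
//   q = (w + 2*pad_w - ((s - 1)*dilation_w + 1)) / stride_w + 1
//     = (16 + 2 - 3) / 1 + 1 = 16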
/////////////////////////////////////////////////////////////////////////////////////////
@ -313,7 +313,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
// default value
problem_.split_k_slices = 1;
}
if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
// default value
problem_.conv_mode = library::ConvModeID::kCrossCorrelation;
@ -345,24 +345,24 @@ Status Conv2dOperationProfiler::initialize_configuration(
}
if (!arg_as_scalar(
problem_.alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
problem_.beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -389,7 +389,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
int(problem_.split_k_slices),
int(problem_.groups)
);
conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));
conv_workspace_.set_stride_vector(
@ -420,7 +420,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
/// Initializes the performance result
void Conv2dOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::ConvDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -432,15 +432,15 @@ void Conv2dOperationProfiler::initialize_result_(
result.arguments.resize(problem_space.rank());
set_argument(result, "Activation", problem_space,
std::string(library::to_string(operation_desc.activation().element))
+ ":" + library::to_string(operation_desc.activation().layout));
set_argument(result, "Filter", problem_space,
std::string(library::to_string(operation_desc.filter().element))
+ ":" + library::to_string(operation_desc.filter().layout));
set_argument(result, "Output", problem_space,
std::string(library::to_string(operation_desc.output().element))
+ ":" + library::to_string(operation_desc.output().layout));
set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
@ -455,7 +455,7 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "k", problem_space, problem_.k);
set_argument(result, "r", problem_space, problem_.r);
set_argument(result, "s", problem_space, problem_.s);
set_argument(result, "p", problem_space, problem_.p);
set_argument(result, "q", problem_space, problem_.q);
@ -470,11 +470,11 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
set_argument(result, "split_k_mode", problem_space,
set_argument(result, "split_k_mode", problem_space,
std::string(library::to_string(problem_.split_k_mode)));
set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
set_argument(result, "conv_mode", problem_space,
set_argument(result, "conv_mode", problem_space,
std::string(library::to_string(problem_.conv_mode)));
set_argument(result, "alpha", problem_space,
@ -483,19 +483,19 @@ void Conv2dOperationProfiler::initialize_result_(
set_argument(result, "beta", problem_space,
library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
set_argument(result, "eq_gemm_provider", problem_space,
set_argument(result, "eq_gemm_provider", problem_space,
std::string(library::to_string(problem_.eq_gemm_provider)));
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
// Bytes of activation, filter, and output tensors
int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
conv_workspace_.configuration.problem_size.activation_size();
int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
conv_workspace_.configuration.problem_size.filter_size();
int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
conv_workspace_.configuration.problem_size.output_size();
// Bytes of activation, filter, and output tensors
@ -511,14 +511,14 @@ void Conv2dOperationProfiler::initialize_result_(
/// Initialize reduction problem dimensions and library::Operation
bool Conv2dOperationProfiler::initialize_reduction_configuration_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &conv_desc =
static_cast<library::ConvDescription const &>(operation->description());
library::ConvKind const &conv_kind = conv_desc.conv_kind;
@ -545,14 +545,14 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
conv_workspace_.reduction_configuration.ldd =
conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
// find reduction operation
library::ReductionFunctionalKey reduction_key(
library::Provider::kCUTLASS,
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
conv_desc.C.element, // element output
conv_desc.element_epilogue // element compute
);
#if 0// debug print to check which reduction instance is selected
std::cout << reduction_key << "\n";
@ -562,7 +562,7 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
return false;
}
// initialize reduction operation required for parallel split-k conv2d operator
reduction_op_ = reduction_it->second;
@ -574,13 +574,24 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
/// Initializes workspace
Status Conv2dOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
// initialize conv2d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
@ -590,15 +601,15 @@ Status Conv2dOperationProfiler::initialize_workspace(
}
}
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(underlying_operation->description());
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
conv_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
conv_workspace_.problem_count = 1;
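
Worked example of the L2-camping heuristic above (illustrative numbers, not profiler code):

int64_t l2_bytes      = 40ll << 20;   // e.g., a 40 MiB L2
int64_t problem_bytes = 25ll << 20;   // total tensor footprint of one problem
int     problem_count = (problem_bytes < 3 * l2_bytes)
    ? 1 + int((3 * l2_bytes) / problem_bytes)   // = 1 + 120/25 = 5 rotating copies
    : 1;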
@ -611,7 +622,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
conv_workspace_.A = device_context.allocate_tensor(
conv_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -619,10 +630,11 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_a(operation_desc.conv_kind),
conv_workspace_.configuration.stride_a,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.B = device_context.allocate_tensor(
conv_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -630,12 +642,13 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.configuration.stride_b,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
if(problem_.groups == problem_.c && problem_.groups == problem_.k){
// Depthwise direct conv kernel needs to reorder the filter.
conv_workspace_.reordered_B = device_context.allocate_tensor(
conv_workspace_.reordered_B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -643,11 +656,12 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.configuration.stride_b,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
}
conv_workspace_.C = device_context.allocate_tensor(
conv_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -655,25 +669,30 @@ Status Conv2dOperationProfiler::initialize_workspace(
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
conv_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.configuration.stride_c,
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
}
@ -706,10 +725,10 @@ Status Conv2dOperationProfiler::initialize_workspace(
conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);
status = reduction_op_->initialize(
&conv_workspace_.reduction_configuration,
conv_workspace_.reduction_host_workspace.data(),
nullptr);
if (status != Status::kSuccess) {
return status;
}
@ -736,7 +755,7 @@ Status Conv2dOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Conv2dOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -769,7 +788,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
}
conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
// update library::ConvArguments for parallel split-k reduction
conv_workspace_.arguments.D = conv_workspace_.device_workspace.data();
@ -799,9 +818,9 @@ bool Conv2dOperationProfiler::verify_cutlass(
}
#if 0
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
<< "reduction : " << reduction_op_->description().name << std::endl;
#endif
@ -818,7 +837,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
results_.back().status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -840,7 +859,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// CUTLASS op ran but has not yet been verified against any verification provider
results_.back().disposition = Disposition::kNotVerified;
//
// Run verification providers
//
@ -856,7 +875,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);
// Initialize reference data to the source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
if (status == Status::kSuccess) {
@ -884,7 +903,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
// Run verification device reference
if (options.verification.provider_enabled(library::Provider::kReferenceDevice)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_device_reference_(
@ -893,13 +912,13 @@ bool Conv2dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Run verification host reference
if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_host_reference_(
@ -908,10 +927,10 @@ bool Conv2dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -936,7 +955,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against host reference
bool Conv2dOperationProfiler::verify_with_host_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -954,14 +973,14 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
library::ConvFunctionalKey conv2d_key(
library::Provider::kReferenceHost,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
#if 0 // debug print to check which host reference instance is selected
@ -974,12 +993,12 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
}
// conv2d host reference minimum cc is 0 (CPU) and no iterator algorithm
library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
@ -1052,9 +1071,9 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1070,7 +1089,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
/// Verifies CUTLASS against host reference
bool Conv2dOperationProfiler::verify_with_device_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1088,14 +1107,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
library::ConvFunctionalKey conv2d_key(
library::Provider::kReferenceDevice,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key);
@ -1105,12 +1124,12 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;
return true;
}
// conv2d device reference minimum cc is 50 and no iterator algorithm
library::ConvPreferenceKey preference_key(50, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;
@ -1119,7 +1138,7 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
// device reference has only one instances in Conv2dOperationVectorMap
library::Operation const *reference_op = cc_it->second[0];
//
// Initialize device reference operation
//
@ -1166,9 +1185,9 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceDevice] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1183,14 +1202,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
/// Measures performance results
bool Conv2dOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
// Initialize structure containing Conv2d arguments
@ -1242,7 +1261,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
GpuTimer timer;
// initialize conv2d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
library::ConvArguments *conv_arguments = static_cast<library::ConvArguments *>(arguments);
@ -1274,7 +1293,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
conv_arguments->B = conv_workspace_.B->batch_data(problem_idx);
conv_arguments->C = conv_workspace_.C->batch_data(problem_idx);
conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx);
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
// update library::ConvArguments for parallel split-k reduction
conv_arguments->D = conv_workspace_.device_workspace.data();
@ -1304,7 +1323,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
return status;
}
}
//
// Initialize GPU timer
//
@ -1319,7 +1338,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
int iteration = 0;
for (; iteration < Iterations; ++iteration) {
// Setup rotating workspace
int problem_idx = (iteration % conv_workspace_.problem_count);
@ -1345,7 +1364,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
device_workspace);
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
@ -1367,7 +1386,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
//
// Update performance result
//
runtime = timer.duration(iteration);
return status;
@ -1378,13 +1397,13 @@ Status Conv2dOperationProfiler::profile_cutlass_(
/// Verifies CUTLASS against cudnn reference
bool Conv2dOperationProfiler::verify_with_cudnn_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
auto &conv_desc = static_cast<library::ConvDescription const &>(operation->description());
//
@ -1395,7 +1414,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
cudnnStatus_t status = handle.get_cudnn_create_status();
if (status != CUDNN_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
return true;
}
@ -1411,7 +1430,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
conv_workspace_.arguments.alpha = problem_.alpha.data();
conv_workspace_.arguments.beta = problem_.beta.data();
conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
// cuDNN does not support four tensor arguments, so we copy the tensor C data into
// tensor D.
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
@ -1423,8 +1442,8 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
// Construct dispatcher to cudnn operator
//
detail::cudnnConvDispatcher conv_op(
conv_desc,
conv_workspace_.configuration,
conv_workspace_.arguments,
handle
@ -1462,7 +1481,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {
save_workspace(

View File

@ -52,10 +52,10 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Conv3dOperationProfiler::Conv3dOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kConv3d,
{
{ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
{ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv3d problem space"},
@ -170,7 +170,7 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(
int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;
// Adjust mainloop flops for strided dgrad
if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
flops_mainloop_ = flops_mainloop_ / ( stride_d * stride_h * stride_w);
@ -183,14 +183,14 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(
/// Extracts the problem dimensions
Status Conv3dOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(operation->description());
if (!arg_as_int(problem_.n, "n", problem_space, problem)) {
@ -207,7 +207,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.h = 14;
}
if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
// default value
problem_.w = 14;
@ -232,7 +232,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.r = 3;
}
if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
// default value
problem_.s = 3;
@ -294,25 +294,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
// cutlass profiler sets p and q which are cuDNN compliant. //
// //
////////////////////////////////////////////////////////////////////////////////////////
// set convolution output z
if (!arg_as_int(problem_.z, "z", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.z = (
problem_.d +
2 * problem_.pad_d -
((problem_.t - 1) * problem_.dilation_d + 1)
) / (problem_.stride_d)
+ 1;
}
// set convolution output p
if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
// default value (set using cudnn formula for output height, when p is not provided)
problem_.p = (
problem_.h +
2 * problem_.pad_h -
((problem_.r - 1) * problem_.dilation_h + 1)
) / (problem_.stride_h)
+ 1;
}
@ -320,10 +320,10 @@ Status Conv3dOperationProfiler::initialize_configuration(
if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
// default value (set using cudnn formula for output width, when q is not provided)
problem_.q = (
problem_.w +
2 * problem_.pad_w -
((problem_.s - 1) * problem_.dilation_w + 1)
) / (problem_.stride_w)
+ 1;
}
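// Illustrative example: w = 14, pad_w = 1, s = 3, dilation_w = 1, stride_w = 1
// gives q = (14 + 2 - 3) / 1 + 1 = 14, matching cuDNN's output-extent formula
// used above for z, p, and q.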
/////////////////////////////////////////////////////////////////////////////////////////
@ -338,7 +338,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
// default value
problem_.split_k_slices = 1;
}
if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
// default value
problem_.conv_mode = library::ConvModeID::kCrossCorrelation;
@ -370,24 +370,24 @@ Status Conv3dOperationProfiler::initialize_configuration(
}
if (!arg_as_scalar(
problem_.alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
problem_.beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -420,25 +420,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
int(problem_.split_k_slices),
1 // groups
);
conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));
conv_workspace_.configuration.layout_activations.stride() = make_Coord(
int(problem_.c),
int(problem_.w) * int(problem_.c),
int(problem_.h) * int(problem_.w) * int(problem_.c),
int(problem_.d) * int(problem_.h) * int(problem_.w) * int(problem_.c)
);
conv_workspace_.configuration.layout_filters.stride() = make_Coord(
int(problem_.c),
int(problem_.s) * int(problem_.c),
int(problem_.r) * int(problem_.s) * int(problem_.c),
int(problem_.t) * int(problem_.r) * int(problem_.s) * int(problem_.c)
);
conv_workspace_.configuration.layout_output.stride() = make_Coord(
int(problem_.k),
int(problem_.q) * int(problem_.k),
int(problem_.q) * int(problem_.p) * int(problem_.k),
int(problem_.z) * int(problem_.q) * int(problem_.p) * int(problem_.k)
@ -469,7 +469,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
/// Initializes the performance result
void Conv3dOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::ConvDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -481,15 +481,15 @@ void Conv3dOperationProfiler::initialize_result_(
result.arguments.resize(problem_space.rank());
set_argument(result, "Activation", problem_space,
std::string(library::to_string(operation_desc.activation().element))
+ ":" + library::to_string(operation_desc.activation().layout));
set_argument(result, "Filter", problem_space,
std::string(library::to_string(operation_desc.filter().element))
+ ":" + library::to_string(operation_desc.filter().layout));
set_argument(result, "Output", problem_space,
std::string(library::to_string(operation_desc.output().element))
+ ":" + library::to_string(operation_desc.output().layout));
set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
@ -506,7 +506,7 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "t", problem_space, problem_.t);
set_argument(result, "r", problem_space, problem_.r);
set_argument(result, "s", problem_space, problem_.s);
set_argument(result, "z", problem_space, problem_.z);
set_argument(result, "p", problem_space, problem_.p);
set_argument(result, "q", problem_space, problem_.q);
@ -523,11 +523,11 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
set_argument(result, "split_k_mode", problem_space,
set_argument(result, "split_k_mode", problem_space,
std::string(library::to_string(problem_.split_k_mode)));
set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
set_argument(result, "conv_mode", problem_space,
set_argument(result, "conv_mode", problem_space,
std::string(library::to_string(problem_.conv_mode)));
set_argument(result, "alpha", problem_space,
@ -536,7 +536,7 @@ void Conv3dOperationProfiler::initialize_result_(
set_argument(result, "beta", problem_space,
library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
set_argument(result, "eq_gemm_provider", problem_space,
set_argument(result, "eq_gemm_provider", problem_space,
std::string(library::to_string(problem_.eq_gemm_provider)));
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -554,14 +554,14 @@ void Conv3dOperationProfiler::initialize_result_(
/// Initialize reduction problem dimensions and library::Operation
bool Conv3dOperationProfiler::initialize_reduction_configuration_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::ConvDescription const &conv_desc =
static_cast<library::ConvDescription const &>(operation->description());
library::ConvKind const &conv_kind = conv_desc.conv_kind;
@ -585,14 +585,14 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
// find reduction operation
library::ReductionFunctionalKey reduction_key(
library::Provider::kCUTLASS,
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
conv_desc.C.element, // element output
conv_desc.element_epilogue // element compute
);
#if 0// debug print to check which reduction instance is selected
std::cout << reduction_key << "\n";
@ -602,7 +602,7 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
return false;
}
// initialize reduction operation required for parallel split-k conv3d operator
reduction_op_ = reduction_it->second;
@ -614,13 +614,24 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
/// Initializes workspace
Status Conv3dOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
// initialize conv3d underlying operation to handle parallel reduction
library::Operation const* underlying_operation = operation;
@ -630,15 +641,15 @@ Status Conv3dOperationProfiler::initialize_workspace(
}
}
library::ConvDescription const &operation_desc =
static_cast<library::ConvDescription const &>(underlying_operation->description());
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
conv_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
conv_workspace_.problem_count = 1;
@ -651,7 +662,7 @@ Status Conv3dOperationProfiler::initialize_workspace(
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
conv_workspace_.A = device_context.allocate_tensor(
conv_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -659,10 +670,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_a(operation_desc.conv_kind),
conv_workspace_.stride_a(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.B = device_context.allocate_tensor(
conv_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -670,10 +682,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_b(operation_desc.conv_kind),
conv_workspace_.stride_b(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.C = device_context.allocate_tensor(
conv_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -681,27 +694,32 @@ Status Conv3dOperationProfiler::initialize_workspace(
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count,
seed_shift++
seed_shift++,
0 // device_index
);
conv_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
conv_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
problem_.extent_c(operation_desc.conv_kind),
conv_workspace_.stride_c(operation_desc.conv_kind),
conv_workspace_.problem_count
conv_workspace_.problem_count,
0 // device_index
);
}
//
@ -733,10 +751,10 @@ Status Conv3dOperationProfiler::initialize_workspace(
conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);
status = reduction_op_->initialize(
&conv_workspace_.reduction_configuration,
conv_workspace_.reduction_host_workspace.data(),
nullptr);
if (status != Status::kSuccess) {
return status;
}
@ -763,7 +781,7 @@ Status Conv3dOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Conv3dOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -784,7 +802,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
set_cutlass_operator_arguments_();
conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());
//
// Run the CUTLASS operation
//
@ -799,9 +817,9 @@ bool Conv3dOperationProfiler::verify_cutlass(
}
#if 0
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
std::cout << "profiling : " << std::endl
<< "conv2d : " << operation->description().name << std::endl
<< "underlying conv2d : " << underlying_operation->description().name << std::endl
<< "reduction : " << reduction_op_->description().name << std::endl;
#endif
@ -818,7 +836,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
results_.back().status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -840,7 +858,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
// CUTLASS op ran but has not yet been verified against any verification provider
results_.back().disposition = Disposition::kNotVerified;
//
// Run verification providers
//
@ -856,7 +874,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);
// Initialize reference data to the source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
if (status == Status::kSuccess) {
@ -883,8 +901,8 @@ bool Conv3dOperationProfiler::verify_cutlass(
// Run verification host reference
if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {
// Restore reference data back to initial source data
conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());
verify_with_host_reference_(
@ -893,10 +911,10 @@ bool Conv3dOperationProfiler::verify_cutlass(
device_context,
operation,
problem_space,
problem);
}
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -921,7 +939,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against host reference
bool Conv3dOperationProfiler::verify_with_host_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -939,14 +957,14 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
library::ConvFunctionalKey conv_key(
library::Provider::kReferenceHost,
conv_desc.conv_kind,
conv_desc.A.element,
conv_desc.A.layout,
conv_desc.B.element,
conv_desc.B.layout,
conv_desc.C.element,
conv_desc.C.layout,
conv_desc.tile_description.math_instruction.element_accumulator,
conv_desc.element_epilogue);
#if 0 // debug print to check which host reference instance is selected
@ -959,12 +977,12 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
}
// conv3d host reference minimum cc is 0 (CPU) and no iterator algorithm
library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
auto cc_it = operators_it->second.find(preference_key);
if(cc_it == operators_it->second.end()) {
results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
return true;
@ -1035,9 +1053,9 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {
save_workspace(
device_context,
options,
@ -1053,7 +1071,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
/// Verifies CUTLASS against host reference
bool Conv3dOperationProfiler::verify_with_device_reference_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1068,14 +1086,14 @@ bool Conv3dOperationProfiler::verify_with_device_reference_(
/// Measures performance results
bool Conv3dOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
set_cutlass_operator_arguments_();
@ -1180,7 +1198,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
return status;
}
}
//
// Initialize GPU timer
//
@ -1198,9 +1216,9 @@ Status Conv3dOperationProfiler::profile_cutlass_(
// Setup rotating workspace
int problem_idx = (iteration % conv_workspace_.problem_count);
set_cutlass_operator_arguments_(problem_idx);
// Run underlying conv3d operation
status = underlying_operation->run(
arguments,
@ -1208,7 +1226,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
device_workspace);
// Run parallel reduction kernel for parallel split_k_mode
if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
status = reduction_op_->run(
&conv_workspace_.reduction_arguments,
conv_workspace_.reduction_host_workspace.data(),
@ -1229,7 +1247,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
//
// Update performance result
//
runtime = timer.duration(iteration);
return status;
@ -1240,7 +1258,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
/// Verifies CUTLASS against cudnn reference
bool Conv3dOperationProfiler::verify_with_cudnn_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -1257,7 +1275,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
cudnnStatus_t status = handle.get_cudnn_create_status();
if (status != CUDNN_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
return true;
}
@ -1285,8 +1303,8 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
// Construct dispatcher to cudnn operator
//
detail::cudnnConvDispatcher conv_op(
conv_desc,
conv_workspace_.configuration,
conv_workspace_.arguments,
handle
@ -1323,7 +1341,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {
save_workspace(

View File

@ -259,6 +259,25 @@ Status cublas_satisfies(library::GemmDescription const &desc) {
return Status::kErrorNotSupported;
}
// Refer to https://docs.nvidia.com/cuda/cublas/#id105
// input types A and B both being FE5M2 is not supported in cuBLASLt
if(desc.A.element == library::NumericTypeID::kFE5M2 &&
desc.B.element == library::NumericTypeID::kFE5M2){
return Status::kErrorNotSupported;
}
// Refer to https://docs.nvidia.com/cuda/cublas/#id105
// if input types A and B are FE5M2 and FE4M3 and C is F32, then D must also be F32
if (desc.A.element == library::NumericTypeID::kFE5M2 &&
desc.B.element == library::NumericTypeID::kFE4M3 &&
desc.C.element == library::NumericTypeID::kF32 &&
desc.D.element != library::NumericTypeID::kF32 ){
return Status::kErrorNotSupported;
}
// output type S4 and S8 not supported in cuBLAS
if (desc.C.element == library::NumericTypeID::kS4 ||
desc.C.element == library::NumericTypeID::kS8) {
@ -405,7 +424,261 @@ cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) {
}
}
} // namespace detail
cublasLtGemmExDispatcher::cublasLtGemmExDispatcher(
library::GemmDescription const &op_desc,
library::GemmUniversalConfiguration configuration_,
library::GemmUniversalArguments arguments_
):
op_desc(op_desc), configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {
bool good = true;
good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
good = (good && get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B));
good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
good = (good && get_cublas_datatype(data_type_B, op_desc.B.element));
good = (good && get_cublas_datatype(data_type_C, op_desc.C.element));
good = (good && get_cublas_datatype(
compute_data_type,
op_desc.tile_description.math_instruction.element_accumulator));
// cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
// internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
library::OpcodeClassID const & opcode_class =
op_desc.tile_description.math_instruction.opcode_class;
if (good &&
op_desc.A.element == library::NumericTypeID::kF32 &&
op_desc.B.element == library::NumericTypeID::kF32 &&
opcode_class == library::OpcodeClassID::kTensorOp) {
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
}
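// F32 tensor-op kernels map to CUBLAS_COMPUTE_32F_FAST_TF32, presumably so the
// cuBLAS baseline exercises the same TF32 tensor-core path as CUTLASS's F32
// tensor-op kernels it is compared against.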
else if (good) {
bool const isPedantic = false;
switch (compute_data_type) {
case CUDA_R_32F:
case CUDA_C_32F:
compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
break;
case CUDA_R_64F:
case CUDA_C_64F:
compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
break;
case CUDA_R_16F:
compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
break;
case CUDA_R_32I:
compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
break;
default:
good = false;
break;
}
}
#endif // __CUDACC_VER_MAJOR__ >= 11
if (!good) {
status = Status::kErrorNotSupported;
}
}
void cublasLtGemmExDispatcher::initialize_cublaslt(){
// create operation descriptor; see cublasLtMatmulDescAttributes_t for details about defaults; here we just need to
// set the transforms for A and B
cublasLtMatmulDescCreate(&operationDesc, compute_type, compute_data_type);
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_A, sizeof(trans_A));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_B, sizeof(trans_B));
uint64_t contiguous_A = (trans_A == CUBLAS_OP_N ? configuration.problem_size.m() : configuration.problem_size.k());
uint64_t strided_A = (trans_A == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.m());
uint64_t contiguous_B = (trans_B == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.n());
uint64_t strided_B = (trans_B == CUBLAS_OP_N ? configuration.problem_size.n() : configuration.problem_size.k());
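// cuBLASLt matrix layouts default to column-major order, so the "contiguous"
// extent above is the row count and the "strided" extent the column count of
// each (possibly transposed) operand.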
// create matrix descriptors, we are good with the details here so no need to set any extra attributes
// table of supported type combinations can be found in the documentation: https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmul
cublasLtMatrixLayoutCreate(&Adesc, data_type_A, contiguous_A, strided_A, configuration.lda);
cublasLtMatrixLayoutCreate(&Bdesc, data_type_B, contiguous_B, strided_B, configuration.ldb);
cublasLtMatrixLayoutCreate(&Cdesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldc);
cublasLtMatrixLayoutCreate(&Ddesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldd);
}
bool cublasLtGemmExDispatcher::get_cublaslt_algo(cublasLtHandle_t handle,
AlgorithmMode algorithm_mode
){
const int requestedAlgoCount = 8; // Request 8 algorithms from the heuristic call; cuBLASLt heuristics return at most 8.
int returnedResults = 0;
cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {};
#if (__CUDACC_VER_MAJOR__ >= 12)
// Decide, based on the unique operation name, whether to enable fast accumulation for the cuBLAS kernel.
std::string operation_name(op_desc.name);
if(operation_name.find("fastaccum") != std::string::npos){
const int8_t fastAccuMode = 1;
cublasLtMatmulDescSetAttribute(operationDesc,
CUBLASLT_MATMUL_DESC_FAST_ACCUM,
&fastAccuMode,
sizeof(fastAccuMode));
}
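// CUBLASLT_MATMUL_DESC_FAST_ACCUM enables FP8 fast accumulation, letting the
// kernel accumulate partial sums at reduced precision for higher throughput;
// this mirrors CUTLASS's *fastaccum kernel variants selected by name above.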
#endif // __CUDACC_VER_MAJOR__ >= 12
// Use 32 MB for the Hopper kernels; this is the max workspace size passed to cublasLtMatmulAlgoGetHeuristic().
size_t workspaceSizeForHeuristics = 32ULL * 1024 * 1024;
void* workspaceHeuristic = nullptr;
cudaError_t result = cudaMalloc((void **)&workspaceHeuristic, workspaceSizeForHeuristics);
if (result != cudaSuccess) {
throw std::bad_alloc();
}
// create preference handle; here we could use extra attributes to disable tensor ops or to make sure algo selected
// will work with badly aligned A, B, C; here for simplicity we just assume A,B,C are always well aligned (e.g.
// directly come from cudaMalloc)
cublasLtMatmulPreferenceCreate(&preference);
cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSizeForHeuristics, sizeof(workspaceSizeForHeuristics));
cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, requestedAlgoCount, heuristicResult, &returnedResults);
if (returnedResults == 0) {
return false;
}
int bestAlgoIdx = 0;
//
// Auto-tuning to find the best kernel for the given problem
//
if (algorithm_mode == AlgorithmMode::kBest) {
float time = 0;
float bestAlgoTime = 0;
cudaStream_t stream;
cudaEvent_t startEvent, stopEvent;
cudaStreamCreate(&stream);
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
constexpr int repeatAlgoCheck = 5;
std::vector<float> algoTimes(repeatAlgoCheck);
for (int algoIdx = 0; algoIdx < returnedResults; algoIdx++) {
for (int checkIdx = 0; checkIdx < repeatAlgoCheck; checkIdx++) {
cudaEventRecord(startEvent, stream);
cublasStatus_t status = cublasLtMatmul(handle,
operationDesc,
arguments.alpha,
arguments.A,
Adesc,
arguments.B,
Bdesc,
arguments.beta,
arguments.C,
Cdesc,
arguments.D,
Ddesc,
&heuristicResult[algoIdx].algo,
workspaceHeuristic,
heuristicResult[algoIdx].workspaceSize,
stream);
// Handle errors
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasLtMatmul AutoTuning failed with status: " << cublasLtGetStatusName(status) << std::endl;
return false;
}
cudaEventRecord(stopEvent, stream);
cudaEventSynchronize(stopEvent);
cudaEventElapsedTime(&time, startEvent, stopEvent);
algoTimes[checkIdx] = time;
}
const size_t size = algoTimes.size();
if (size == 0) {
time = 0;
}
std::sort(algoTimes.begin(), algoTimes.end());
const size_t mid = size / 2;
if (size % 2 == 0) {
time = (algoTimes[mid] + algoTimes[mid - 1]) / 2;
}
else {
time = algoTimes[mid];
}
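// time now holds the median of the repeated timings, which is more robust to
// a single noisy measurement than the mean or the minimum.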
if (algoIdx == 0 || time < bestAlgoTime) {
bestAlgoTime = time;
bestAlgoIdx = algoIdx;
}
}
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
std::cout << "\n";
std::cout << "# Algorithms checked: " << returnedResults << "\n";
std::cout << "WorkspaceSize Allocated: " << heuristicResult[bestAlgoIdx].workspaceSize << "\n";
std::cout << "Algorithm selected after auto-tuning is:" << "\n";
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme;
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
printf("algo={ Id=%d, tileIdx=%d splitK=%d reduc=%d swizzle=%d custom=%d }\n",
algoId, tile, numSplitsK, reductionScheme, swizzle, customOption);
#endif
if (stream) cudaStreamDestroy(stream);
if (startEvent) cudaEventDestroy(startEvent);
if (stopEvent) cudaEventDestroy(stopEvent);
}
// Set the selected algorithm on the dispatcher
heuristicResult_ = heuristicResult[bestAlgoIdx];
result = cudaMalloc((void **)&workspace, heuristicResult_.workspaceSize);
if (result != cudaSuccess) {
throw std::bad_alloc();
}
return true;
}
cublasStatus_t cublasLtGemmExDispatcher::operator()(cublasLtHandle_t handle)
{
return cublasLtMatmul(handle,
operationDesc,
arguments.alpha,
arguments.A,
Adesc,
arguments.B,
Bdesc,
arguments.beta,
arguments.C,
Cdesc,
arguments.D,
Ddesc,
&heuristicResult_.algo,
workspace,
heuristicResult_.workspaceSize,
0); // launch on the default CUDA stream (stream 0)
}
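// Typical call sequence for this dispatcher (illustrative sketch; assumes a
// valid cublasLtHandle_t `handle` and populated desc/config/args objects):
//
//   detail::cublasLtGemmExDispatcher gemm(desc, config, args);
//   gemm.initialize_cublaslt();
//   if (gemm.get_cublaslt_algo(handle, AlgorithmMode::kBest)) {
//     cublasStatus_t st = gemm(handle);  // runs cublasLtMatmul
//   }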
} // namespace detail
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -208,19 +208,6 @@ void CutlassProfiler::print_options_(std::ostream &out) {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Initializes the CUDA device
void CutlassProfiler::initialize_device_() {
cudaError_t result = cudaSetDevice(options_.device.device);
if (result != cudaSuccess) {
std::cerr << "Failed to set device.";
throw std::runtime_error("Failed to set device");
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace profiler
} // namespace cutlass

View File

@ -88,16 +88,16 @@ static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &ext
/// Returns the stride of a packed layout
std::vector<int64_t> DeviceAllocation::get_packed_layout(
library::LayoutTypeID layout_id,
std::vector<int> const &extent) {
std::vector<int64_t> stride;
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
stride = get_packed_layout_stride<cutlass::layout::ColumnMajor>(extent);
break;
case library::LayoutTypeID::kRowMajor:
stride = get_packed_layout_stride<cutlass::layout::RowMajor>(extent);
break;
case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -159,7 +159,7 @@ std::vector<int64_t> DeviceAllocation::get_packed_layout(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Template to use CUTLASS Layout functions to
template <typename Layout>
static size_t construct_layout_(
void *bytes,
@ -177,8 +177,8 @@ static size_t construct_layout_(
stride = get_packed_layout_stride<Layout>(extent);
return construct_layout_<Layout>(
bytes,
layout_id,
extent,
stride);
}
@ -202,7 +202,7 @@ static size_t construct_layout_(
// Pack it into bytes
if (bytes) {
*reinterpret_cast<Layout *>(bytes) = layout;
}
// Return capacity
@ -219,10 +219,10 @@ size_t DeviceAllocation::construct_layout(
std::vector<int64_t> &stride) {
switch (layout_id) {
case library::LayoutTypeID::kColumnMajor:
return construct_layout_<cutlass::layout::ColumnMajor>(bytes, layout_id, extent, stride);
case library::LayoutTypeID::kRowMajor:
return construct_layout_<cutlass::layout::RowMajor>(bytes, layout_id, extent, stride);
case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -284,24 +284,26 @@ size_t DeviceAllocation::construct_layout(
/////////////////////////////////////////////////////////////////////////////////////////////////
DeviceAllocation::DeviceAllocation():
type_(library::NumericTypeID::kInvalid),
batch_stride_(0),
capacity_(0),
pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown),
batch_count_(1) {
batch_count_(1),
device_(-1) {
}
DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
size_t capacity
library::NumericTypeID type,
size_t capacity,
int device
):
type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown), batch_count_(1) {
type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr),
layout_(library::LayoutTypeID::kUnknown), batch_count_(1), device_(device) {
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity));
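// Note: this->malloc is assumed here to be DeviceAllocation's device-aware
// allocator (switching to device_ before allocating); its definition is not
// part of this hunk.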
if (result != cudaSuccess) {
type_ = library::NumericTypeID::kInvalid;
@ -312,13 +314,15 @@ DeviceAllocation::DeviceAllocation(
}
DeviceAllocation::DeviceAllocation(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count
int batch_count,
int device
):
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)),
pointer_(nullptr), batch_count_(1), device_(device) {
reset(type, layout_id, extent, stride, batch_count);
}
@ -355,7 +359,7 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca
batch_stride_ = capacity;
capacity_ = capacity;
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type_, capacity_));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type_, capacity_));
if (result != cudaSuccess) {
throw std::bad_alloc();
}
@ -373,9 +377,9 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca
/// Allocates memory for a given layout and tensor
DeviceAllocation &DeviceAllocation::reset(
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count) {
@ -391,14 +395,14 @@ DeviceAllocation &DeviceAllocation::reset(
batch_count_ = batch_count;
batch_stride_ = construct_layout(
tensor_ref_buffer_.data() + sizeof(pointer_),
layout_id,
extent,
stride_);
capacity_ = batch_stride_ * batch_count_;
cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity_));
cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity_));
if (result != cudaSuccess) {
throw std::bad_alloc();
}
@ -421,7 +425,7 @@ void *DeviceAllocation::data() const {
}
void *DeviceAllocation::batch_data(int batch_idx) const {
return static_cast<char *>(data()) + batch_stride_bytes() * batch_idx;
}
library::LayoutTypeID DeviceAllocation::layout() const {
@ -1476,159 +1480,159 @@ void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeI
/// Returns true if two blocks have exactly the same value
bool DeviceAllocation::block_compare_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity) {
switch (numeric_type) {
case library::NumericTypeID::kFE4M3:
return reference::device::BlockCompareEqual<float_e4m3_t>(
reinterpret_cast<float_e4m3_t const *>(ptr_A),
reinterpret_cast<float_e4m3_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kFE5M2:
return reference::device::BlockCompareEqual<float_e5m2_t>(
reinterpret_cast<float_e5m2_t const *>(ptr_A),
reinterpret_cast<float_e5m2_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kF16:
return reference::device::BlockCompareEqual<half_t>(
reinterpret_cast<half_t const *>(ptr_A),
reinterpret_cast<half_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kBF16:
return reference::device::BlockCompareEqual<bfloat16_t>(
reinterpret_cast<bfloat16_t const *>(ptr_A),
reinterpret_cast<bfloat16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kTF32:
return reference::device::BlockCompareEqual<tfloat32_t>(
reinterpret_cast<tfloat32_t const *>(ptr_A),
reinterpret_cast<tfloat32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kF32:
return reference::device::BlockCompareEqual<float>(
reinterpret_cast<float const *>(ptr_A),
reinterpret_cast<float const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF32:
return reference::device::BlockCompareEqual<cutlass::complex<float> >(
reinterpret_cast<complex<float> const *>(ptr_A),
reinterpret_cast<complex<float> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF16:
return reference::device::BlockCompareEqual<complex<half_t>>(
reinterpret_cast<complex<half_t> const *>(ptr_A),
reinterpret_cast<complex<half_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCBF16:
return reference::device::BlockCompareEqual<complex<bfloat16_t>>(
reinterpret_cast<complex<bfloat16_t> const *>(ptr_A),
reinterpret_cast<complex<bfloat16_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCTF32:
return reference::device::BlockCompareEqual<complex<tfloat32_t>>(
reinterpret_cast<complex<tfloat32_t> const *>(ptr_A),
reinterpret_cast<complex<tfloat32_t> const *>(ptr_B),
capacity);
case library::NumericTypeID::kF64:
return reference::device::BlockCompareEqual<double>(
reinterpret_cast<double const *>(ptr_A),
reinterpret_cast<double const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF64:
return reference::device::BlockCompareEqual<complex<double>>(
reinterpret_cast<complex<double> const *>(ptr_A),
reinterpret_cast<complex<double> const *>(ptr_B),
capacity);
case library::NumericTypeID::kS2:
return reference::device::BlockCompareEqual<int2b_t>(
reinterpret_cast<int2b_t const *>(ptr_A),
reinterpret_cast<int2b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS4:
return reference::device::BlockCompareEqual<int4b_t>(
reinterpret_cast<int4b_t const *>(ptr_A),
reinterpret_cast<int4b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS8:
return reference::device::BlockCompareEqual<int8_t>(
reinterpret_cast<int8_t const *>(ptr_A),
reinterpret_cast<int8_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS16:
return reference::device::BlockCompareEqual<int16_t>(
reinterpret_cast<int16_t const *>(ptr_A),
reinterpret_cast<int16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS32:
return reference::device::BlockCompareEqual<int32_t>(
reinterpret_cast<int32_t const *>(ptr_A),
reinterpret_cast<int32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kS64:
return reference::device::BlockCompareEqual<int64_t>(
reinterpret_cast<int64_t const *>(ptr_A),
reinterpret_cast<int64_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kB1:
return reference::device::BlockCompareEqual<uint1b_t>(
reinterpret_cast<uint1b_t const *>(ptr_A),
reinterpret_cast<uint1b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU2:
return reference::device::BlockCompareEqual<uint2b_t>(
reinterpret_cast<uint2b_t const *>(ptr_A),
reinterpret_cast<uint2b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU4:
return reference::device::BlockCompareEqual<uint4b_t>(
reinterpret_cast<uint4b_t const *>(ptr_A),
reinterpret_cast<uint4b_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU8:
return reference::device::BlockCompareEqual<uint8_t>(
reinterpret_cast<uint8_t const *>(ptr_A),
reinterpret_cast<uint8_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU16:
return reference::device::BlockCompareEqual<uint16_t>(
reinterpret_cast<uint16_t const *>(ptr_A),
reinterpret_cast<uint16_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU32:
return reference::device::BlockCompareEqual<uint32_t>(
reinterpret_cast<uint32_t const *>(ptr_A),
reinterpret_cast<uint32_t const *>(ptr_B),
capacity);
case library::NumericTypeID::kU64:
return reference::device::BlockCompareEqual<uint64_t>(
reinterpret_cast<uint64_t const *>(ptr_A),
reinterpret_cast<uint64_t const *>(ptr_B),
capacity);
default:
@ -1638,9 +1642,9 @@ bool DeviceAllocation::block_compare_equal(
/// Returns true if two blocks have approximately the same value
bool DeviceAllocation::block_compare_relatively_equal(
library::NumericTypeID numeric_type,
void const *ptr_A,
void const *ptr_B,
size_t capacity,
double epsilon,
double nonzero_floor) {
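The per-type dispatch below forwards to device-side reference kernels. As a sketch of the intended comparison semantics (an assumption for illustration; the authoritative element-wise definition is cutlass::relatively_equal in the CUTLASS utilities):

#include <algorithm>
#include <cmath>

// Sketch only: a relative comparison with a floor, so values near zero are
// not held to an impossibly tight tolerance.
bool relatively_equal_sketch(double a, double b, double epsilon, double nonzero_floor) {
  double magnitude = std::max(std::fabs(a), std::fabs(b));
  magnitude = std::max(magnitude, nonzero_floor);  // clamp near-zero magnitudes
  return std::fabs(a - b) <= epsilon * magnitude;
}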
@ -1648,161 +1652,161 @@ bool DeviceAllocation::block_compare_relatively_equal(
switch (numeric_type) {
case library::NumericTypeID::kFE4M3:
return reference::device::BlockCompareRelativelyEqual<float_e4m3_t>(
reinterpret_cast<float_e4m3_t const *>(ptr_A),
reinterpret_cast<float_e4m3_t const *>(ptr_B),
capacity,
static_cast<float_e4m3_t>(epsilon),
static_cast<float_e4m3_t>(nonzero_floor));
case library::NumericTypeID::kFE5M2:
return reference::device::BlockCompareRelativelyEqual<float_e5m2_t>(
reinterpret_cast<float_e5m2_t const *>(ptr_A),
reinterpret_cast<float_e5m2_t const *>(ptr_B),
capacity,
static_cast<float_e5m2_t>(epsilon),
static_cast<float_e5m2_t>(nonzero_floor));
case library::NumericTypeID::kF16:
return reference::device::BlockCompareRelativelyEqual<half_t>(
reinterpret_cast<half_t const *>(ptr_A),
reinterpret_cast<half_t const *>(ptr_B),
capacity,
static_cast<half_t>(epsilon),
static_cast<half_t>(nonzero_floor));
case library::NumericTypeID::kBF16:
return reference::device::BlockCompareRelativelyEqual<bfloat16_t>(
reinterpret_cast<bfloat16_t const *>(ptr_A),
reinterpret_cast<bfloat16_t const *>(ptr_B),
capacity,
static_cast<bfloat16_t>(epsilon),
static_cast<bfloat16_t>(nonzero_floor));
case library::NumericTypeID::kTF32:
return reference::device::BlockCompareRelativelyEqual<tfloat32_t>(
reinterpret_cast<tfloat32_t const *>(ptr_A),
reinterpret_cast<tfloat32_t const *>(ptr_B),
capacity,
static_cast<tfloat32_t>(epsilon),
static_cast<tfloat32_t>(nonzero_floor));
case library::NumericTypeID::kF32:
return reference::device::BlockCompareRelativelyEqual<float>(
reinterpret_cast<float const *>(ptr_A),
reinterpret_cast<float const *>(ptr_B),
capacity,
static_cast<float>(epsilon),
static_cast<float>(nonzero_floor));
case library::NumericTypeID::kF64:
return reference::device::BlockCompareRelativelyEqual<double>(
reinterpret_cast<double const *>(ptr_A),
reinterpret_cast<double const *>(ptr_B),
capacity,
static_cast<double>(epsilon),
static_cast<double>(nonzero_floor));
case library::NumericTypeID::kS2:
return reference::device::BlockCompareRelativelyEqual<int2b_t>(
reinterpret_cast<int2b_t const *>(ptr_A),
reinterpret_cast<int2b_t const *>(ptr_B),
capacity,
static_cast<int2b_t>(epsilon),
static_cast<int2b_t>(nonzero_floor));
case library::NumericTypeID::kS4:
return reference::device::BlockCompareRelativelyEqual<int4b_t>(
reinterpret_cast<int4b_t const *>(ptr_A),
reinterpret_cast<int4b_t const *>(ptr_B),
capacity,
static_cast<int4b_t>(epsilon),
static_cast<int4b_t>(nonzero_floor));
case library::NumericTypeID::kS8:
return reference::device::BlockCompareRelativelyEqual<int8_t>(
reinterpret_cast<int8_t const *>(ptr_A),
reinterpret_cast<int8_t const *>(ptr_B),
capacity,
static_cast<int8_t>(epsilon),
static_cast<int8_t>(nonzero_floor));
case library::NumericTypeID::kS16:
return reference::device::BlockCompareRelativelyEqual<int16_t>(
reinterpret_cast<int16_t const *>(ptr_A),
reinterpret_cast<int16_t const *>(ptr_B),
capacity,
static_cast<int16_t>(epsilon),
static_cast<int16_t>(nonzero_floor));
case library::NumericTypeID::kS32:
return reference::device::BlockCompareRelativelyEqual<int32_t>(
reinterpret_cast<int32_t const *>(ptr_A),
reinterpret_cast<int32_t const *>(ptr_B),
capacity,
static_cast<int32_t>(epsilon),
static_cast<int32_t>(nonzero_floor));
case library::NumericTypeID::kS64:
return reference::device::BlockCompareRelativelyEqual<int64_t>(
reinterpret_cast<int64_t const *>(ptr_A),
reinterpret_cast<int64_t const *>(ptr_B),
capacity,
static_cast<int64_t>(epsilon),
static_cast<int64_t>(nonzero_floor));
case library::NumericTypeID::kB1:
return reference::device::BlockCompareRelativelyEqual<uint1b_t>(
reinterpret_cast<uint1b_t const *>(ptr_A),
reinterpret_cast<uint1b_t const *>(ptr_B),
capacity,
static_cast<uint1b_t>(epsilon),
static_cast<uint1b_t>(nonzero_floor));
case library::NumericTypeID::kU2:
return reference::device::BlockCompareRelativelyEqual<uint2b_t>(
reinterpret_cast<uint2b_t const *>(ptr_A),
reinterpret_cast<uint2b_t const *>(ptr_B),
capacity,
static_cast<uint2b_t>(epsilon),
static_cast<uint2b_t>(nonzero_floor));
case library::NumericTypeID::kU4:
return reference::device::BlockCompareRelativelyEqual<uint4b_t>(
reinterpret_cast<uint4b_t const *>(ptr_A),
reinterpret_cast<uint4b_t const *>(ptr_B),
capacity,
static_cast<uint4b_t>(epsilon),
static_cast<uint4b_t>(nonzero_floor));
case library::NumericTypeID::kU8:
return reference::device::BlockCompareRelativelyEqual<uint8_t>(
reinterpret_cast<uint8_t const *>(ptr_A),
reinterpret_cast<uint8_t const *>(ptr_B),
capacity,
static_cast<uint8_t>(epsilon),
static_cast<uint8_t>(nonzero_floor));
case library::NumericTypeID::kU16:
return reference::device::BlockCompareRelativelyEqual<uint16_t>(
reinterpret_cast<uint16_t const *>(ptr_A),
reinterpret_cast<uint16_t const *>(ptr_B),
capacity,
static_cast<uint16_t>(epsilon),
static_cast<uint16_t>(nonzero_floor));
case library::NumericTypeID::kU32:
return reference::device::BlockCompareRelativelyEqual<uint32_t>(
reinterpret_cast<uint32_t const *>(ptr_A),
reinterpret_cast<uint32_t const *>(ptr_B),
capacity,
static_cast<uint32_t>(epsilon),
static_cast<uint32_t>(nonzero_floor));
case library::NumericTypeID::kU64:
return reference::device::BlockCompareRelativelyEqual<uint64_t>(
reinterpret_cast<uint64_t const *>(ptr_A),
reinterpret_cast<uint64_t const *>(ptr_B),
capacity,
static_cast<uint64_t>(epsilon),
static_cast<uint64_t>(nonzero_floor));
// No relatively equal comparison for complex numbers.
@ -1821,7 +1825,7 @@ bool DeviceAllocation::block_compare_relatively_equal(
reinterpret_cast<complex<float> const *>(ptr_A),
reinterpret_cast<complex<float> const *>(ptr_B),
capacity);
case library::NumericTypeID::kCF64:
return reference::device::BlockCompareEqual<cutlass::complex<double> >(
reinterpret_cast<complex<double> const *>(ptr_A),
@ -1837,14 +1841,14 @@ bool DeviceAllocation::block_compare_relatively_equal(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord, int Rank>
struct vector_to_coord {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
coord[Rank - 1] = vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
@ -1853,17 +1857,17 @@ struct vector_to_coord {
vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {
coord[Rank - 1] = (int)vec.at(Rank - 1);
if (Rank > 1) {
vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
}
}
};
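A usage sketch of this recursive copy (the specializations below terminate the recursion; sizes are illustrative):

// Copies a dynamic std::vector into a static-rank coordinate by
// compile-time recursion from the highest index down.
cutlass::Coord<3> coord;
std::vector<int> vec = {4, 8, 16};
vector_to_coord<cutlass::Coord<3>, 3>(coord, vec);
// coord now holds {4, 8, 16}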
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 1> {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
coord[0] = vec.at(0);
@ -1875,10 +1879,10 @@ struct vector_to_coord<TensorCoord, 1> {
}
};
/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 0> {
vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {
}
@ -1888,7 +1892,7 @@ struct vector_to_coord<TensorCoord, 0> {
template <typename Element, typename Layout>
static void write_tensor_csv_static_tensor_view(
std::ostream &out,
DeviceAllocation &allocation) {
Coord<Layout::kRank> extent;
@ -1903,7 +1907,7 @@ static void write_tensor_csv_static_tensor_view(
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
Layout::kStrideRank>(stride, allocation.stride());
Layout layout(stride);
@ -1914,7 +1918,7 @@ static void write_tensor_csv_static_tensor_view(
}
host_tensor.copy_in_device_to_host(
static_cast<Element const *>(allocation.data()),
allocation.batch_stride());
TensorViewWrite(out, host_tensor.host_view());
@ -1926,7 +1930,7 @@ static void write_tensor_csv_static_tensor_view(
template <typename T>
static void write_tensor_csv_static_type(
std::ostream &out,
DeviceAllocation &allocation) {
switch (allocation.layout()) {
@ -1991,7 +1995,7 @@ static void write_tensor_csv_static_type(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Writes a tensor to csv
void DeviceAllocation::write_tensor_csv(
std::ostream &out) {
@ -1999,14 +2003,14 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kFE4M3:
write_tensor_csv_static_type<float_e4m3_t>(out, *this);
break;
case library::NumericTypeID::kFE5M2:
write_tensor_csv_static_type<float_e5m2_t>(out, *this);
break;
case library::NumericTypeID::kF16:
write_tensor_csv_static_type<half_t>(out, *this);
break;
case library::NumericTypeID::kBF16:
write_tensor_csv_static_type<bfloat16_t>(out, *this);
break;
@ -2022,7 +2026,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kF64:
write_tensor_csv_static_type<double>(out, *this);
break;
case library::NumericTypeID::kS2:
write_tensor_csv_static_type<int2b_t>(out, *this);
break;
@ -2046,7 +2050,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kS64:
write_tensor_csv_static_type<int64_t>(out, *this);
break;
case library::NumericTypeID::kB1:
write_tensor_csv_static_type<uint1b_t>(out, *this);
break;
@ -2074,7 +2078,7 @@ void DeviceAllocation::write_tensor_csv(
case library::NumericTypeID::kU64:
write_tensor_csv_static_type<uint64_t>(out, *this);
break;
case library::NumericTypeID::kCF16:
write_tensor_csv_static_type<cutlass::complex<half_t> >(out, *this);
break;
@ -2110,7 +2114,7 @@ static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val =
}
vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
Layout::kStrideRank>(stride, allocation.stride());
TensorView<Element, Layout> view(
@ -2432,6 +2436,46 @@ void DeviceAllocation::fill_host(double val = 0.0) {
copy_from_host(host_data.data());
}
cudaError_t DeviceAllocation::malloc(void** ptr, size_t size) {
cudaError_t result;
int set_device_back_to = -1;
/// If the allocation is bound to a specific device, switch to it, remembering
/// the caller's current device so it can be restored after the cudaMalloc.
if (device_ >= 0) {
int current_device;
result = cudaGetDevice(&current_device);
if (result != cudaSuccess) {
return result;
}
if (current_device != device_) {
set_device_back_to = current_device;
result = cudaSetDevice(device_);
if (result != cudaSuccess) {
return result;
}
}
}
// This performs the cudaMalloc
result = cudaMalloc(ptr, size);
if (result != cudaSuccess) {
return result;
}
/// Restore the caller's original device if it was changed above.
if (set_device_back_to != -1) {
result = cudaSetDevice(set_device_back_to);
if (result != cudaSuccess) {
return result;
}
}
return cudaSuccess;
}
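A caller-side sketch of the guarantee this helper provides (illustrative only; assumes `alloc` is a DeviceAllocation bound to device 1 while the caller works on device 0):

#include <cassert>
#include <cuda_runtime_api.h>

void malloc_example(cutlass::profiler::DeviceAllocation &alloc) {
  cudaSetDevice(0);                // caller's working device
  void *ptr = nullptr;
  alloc.malloc(&ptr, 1 << 20);     // cudaMalloc executes on the allocation's device
  int current = -1;
  cudaGetDevice(&current);
  assert(current == 0);            // the caller's device was restored
}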
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -29,7 +29,7 @@
*
**************************************************************************************************/
/* \file
\brief
*/
#include "cutlass/profiler/device_context.h"
@ -41,29 +41,16 @@ namespace profiler {
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_block(
Options const &options,
std::string const &name,
library::NumericTypeID type,
size_t capacity,
size_t device_index) {
int device = options.device.device_id(device_index);
device_memory_.emplace_back(type, capacity, device);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_tensor(
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count) {
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
@ -72,18 +59,40 @@ DeviceAllocation *DeviceContext::allocate_tensor(
DeviceAllocation *DeviceContext::allocate_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
size_t device_index) {
int device = options.device.device_id(device_index);
device_memory_.emplace_back(type, layout_id, extent, stride, batch_count,
device);
DeviceAllocation *allocation = &device_memory_.back();
allocations_[name] = allocation;
return allocation;
}
/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_and_initialize_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index) {
DeviceAllocation *allocation =
allocate_tensor(options, name, type, layout_id, extent, stride,
batch_count, device_index);
if (options.initialization.enabled) {
Distribution data_distribution = options.initialization.data_distribution;
// check if data distribution is allowed to change
if(!options.initialization.fix_data_distribution) {
@ -129,13 +138,13 @@ DeviceAllocation *DeviceContext::allocate_tensor(
double stddev = data_distribution.gaussian.stddev;
int scale = data_distribution.int_scale;
if (name == "A" && data_distribution.gaussian.pnzA != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzA);
}
else if (name == "B" && data_distribution.gaussian.pnzB != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzB);
}
else if (name == "C" && data_distribution.gaussian.pnzC != 1.0) {
data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzC);
}
}
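For example (values illustrative), a run requesting 50% nonzeros in A only takes the first branch above; B and C keep the new fractional default of 1.0:

// Hypothetical values: pnzA == 0.5, pnzB == pnzC == 1.0, so only tensor "A"
// is re-issued with a sparsified Gaussian distribution.
data_distribution.set_gaussian(mean, stddev, scale, /*pnz=*/0.5);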
@ -147,7 +156,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
}
else {
allocation->initialize_random_device(
options.initialization.seed + seed_shift,
data_distribution);
}
}
@ -158,7 +167,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
}
else {
allocation->initialize_random_host(
options.initialization.seed + seed_shift,
data_distribution);
}
}
@ -167,20 +176,22 @@ DeviceAllocation *DeviceContext::allocate_tensor(
return allocation;
}
/// Allocates memory for sparse meta data
DeviceAllocation *DeviceContext::allocate_and_initialize_sparsemeta_tensor(
Options const &options,
std::string const &name,
library::NumericTypeID type,
library::LayoutTypeID layout_id,
library::NumericTypeID type_a,
std::vector<int> const &extent,
std::vector<int64_t> const &stride,
int batch_count,
int seed_shift,
size_t device_index) {
DeviceAllocation *allocation =
allocate_tensor(options, name, type, layout_id, extent, stride,
batch_count, device_index);
if (options.initialization.enabled) {
// TF32 has 4bit meta data. The rest has 2bit.
@ -188,12 +199,12 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
if (options.initialization.provider == library::Provider::kReferenceDevice) {
allocation->initialize_random_sparsemeta_device(
options.initialization.seed + seed_shift,
MetaSizeInBits);
}
else if (options.initialization.provider == library::Provider::kReferenceHost) {
allocation->initialize_random_sparsemeta_host(
options.initialization.seed + seed_shift,
MetaSizeInBits);
}
}

View File

@ -39,6 +39,7 @@
#include <vector>
#include "cutlass/core_io.h"
#include <cuda_runtime_api.h>
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/gemm_operation_profiler.h"
@ -46,7 +47,6 @@
#include "cutlass/library/singleton.h"
#include "cutlass/library/library.h"
#include "cutlass/library/handle.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
@ -485,6 +485,17 @@ Status GemmOperationProfiler::initialize_workspace(
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::Operation const* underlying_operation = operation;
if (problem_.split_k_mode == library::SplitKMode::kParallel) {
@ -496,12 +507,14 @@ Status GemmOperationProfiler::initialize_workspace(
library::GemmDescription const &operation_desc =
static_cast<library::GemmDescription const &>(operation->description());
bool is_sparse = operation_desc.tile_description.math_instruction.opcode_class == cutlass::library::OpcodeClassID::kSparseTensorOp;
// Compute the number of copies of the problem to avoid L2 camping.
if (!options.profiling.workspace_count) {
int64_t bytes = problem_.bytes(operation_desc);
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
gemm_workspace_.problem_count =
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
}
else {
gemm_workspace_.problem_count = 1;
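As a worked example of the replication rule above (sizes illustrative, not from the commit): with a 40 MiB L2 and a 25 MiB problem footprint, 3 * 40 MiB = 120 MiB exceeds 25 MiB, so five distinct copies are cycled through and back-to-back profiling iterations cannot be served from a warm L2.

// Illustrative arithmetic only (hypothetical sizes):
int64_t l2_bytes = 40ll << 20;       // 40 MiB L2
int64_t problem_bytes = 25ll << 20;  // 25 MiB problem footprint
int problem_count = (problem_bytes < 3 * l2_bytes)
    ? 1 + int((3 * l2_bytes) / problem_bytes)  // 1 + 4 -> 5
    : 1;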
@ -514,7 +527,7 @@ Status GemmOperationProfiler::initialize_workspace(
bool allocate_device_tensors = options.execution_mode != ExecutionMode::kDryRun;
if (allocate_device_tensors) {
int seed_shift = 0;
gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -522,10 +535,11 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k)},
{int(problem_.lda)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -533,10 +547,11 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.k), int(problem_.n)},
{int(problem_.ldb)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -544,25 +559,30 @@ Status GemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
seed_shift++,
0 // device_index
);
gemm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
0 // device_index
);
gemm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
problem_.batch_count * gemm_workspace_.problem_count,
0 // device_index
);
}
@ -580,7 +600,7 @@ Status GemmOperationProfiler::initialize_workspace(
gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Computed->batch_stride();
/* Query device SM count to pass onto the kernel as an argument, where needed */
gemm_workspace_.arguments.sm_count = options.device.properties[0].multiProcessorCount;
}
//
@ -596,12 +616,34 @@ Status GemmOperationProfiler::initialize_workspace(
workspace_size = underlying_operation->get_device_workspace_size(&gemm_workspace_.configuration,
&gemm_workspace_.arguments);
if (is_sparse) {
// For sparse GEMM, get_device_workspace_size() returns the device workspace size
// for a single iteration, so multiply it by the number of iterations.
workspace_size *= gemm_workspace_.problem_count;
}
gemm_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);
// Convert contents to structured-sparse form here.
if (is_sparse) {
uint8_t* profiler_workspaces[1];
profiler_workspaces[0] = reinterpret_cast<uint8_t*>(gemm_workspace_.A->data());
// Sparse operations have a different initialization interface.
// initialize_with_profiler_workspace converts the dense MxK tensor A into a
// compressed MxK/sp tensor A plus the metadata tensor E.
auto modifiable_underlying_op = const_cast<library::Operation*>(underlying_operation);
status = modifiable_underlying_op->initialize_with_profiler_workspace(
&gemm_workspace_.configuration,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data(),
profiler_workspaces,
gemm_workspace_.problem_count);
}
else {
status = underlying_operation->initialize(
&gemm_workspace_.configuration,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data());
}
if (status != Status::kSuccess) {
return status;
}
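For intuition on the shapes the sparse path produces (a sketch under the usual 2:4 structured-sparsity assumption; the actual conversion is internal to initialize_with_profiler_workspace):

// Illustrative shapes only. 2:4 sparsity keeps 2 of every 4 elements,
// halving the K extent of A and adding a metadata tensor E whose entries
// are 2-bit selectors (4-bit for TF32, matching the MetaSizeInBits logic
// in the device context above).
int m = 1024, k = 512;
int sp = 2;                    // compression factor for 2:4 sparsity
int k_compressed = k / sp;     // compressed A is m x 256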
@ -821,26 +863,14 @@ bool GemmOperationProfiler::verify_with_cublas_(
// Construct cuBLAS operators
//
CublasLtCreate handle;
cublasStatus_t status = handle.get_cublaslt_create_status();
if (status != CUBLAS_STATUS_SUCCESS) {
results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
return true;
}
//
// Initialize state
@ -865,29 +895,34 @@ bool GemmOperationProfiler::verify_with_cublas_(
gemm_workspace_.arguments.beta = problem_.beta.data();
gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasLtGemmExDispatcher gemm_op(
gemm_desc,
gemm_workspace_.configuration,
gemm_workspace_.arguments
);
gemm_op.initialize_cublaslt();
if (!gemm_op.get_cublaslt_algo(handle, AlgorithmMode::kDefault)) {
return true;
}
if (gemm_op.status != Status::kSuccess) {
results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
return true;
}
status = gemm_op(handle);
// Handle errors
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasLt verification run failed with status: " << cublasLtGetStatusName(status) << "\n";
results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
return true;
}
results_.back().status = Status::kSuccess;
//
// Verify results
//
@ -930,9 +965,9 @@ bool GemmOperationProfiler::verify_with_reference_(
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem,
cutlass::library::NumericTypeID element_A,
cutlass::library::NumericTypeID element_B)
{
library::GemmDescription const &gemm_desc =
static_cast<library::GemmDescription const &>(operation->description());

View File

@ -376,14 +376,14 @@ int OperationProfiler::profile_all(
std::cerr << " @ provider " << operation->description().provider
<< " != library::Provider::kCUTLASS\n";
}
if (options.device.compute_capability(0) < min_cc) {
std::cerr << " @ compute_capability "
<< options.device.compute_capability(0)
<< " < min_cc " << min_cc << "\n";
}
if (options.device.compute_capability(0) > max_cc) {
std::cerr << " @ compute_capability "
<< options.device.compute_capability(0)
<< " > max_cc " << max_cc << "\n";
}
#endif
@ -391,8 +391,8 @@ int OperationProfiler::profile_all(
// Execute compatible cutlass operations if they satisfy the current device's compute capability
if (operation->description().kind == kind_ &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability(0) >= min_cc &&
options.device.compute_capability(0) <= max_cc) {
std::string operation_name(operation->description().name);
// Filter kernels by name

View File

@ -33,6 +33,7 @@
*/
#include <algorithm>
#include <set>
#include "cutlass/cutlass.h"
#include "cutlass/version.h"
@ -55,45 +56,97 @@ static char const *end_of_line = "\n
Options::Device::Device(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("device", device, 0);
// Gets the number of devices for future validation
cudaError_t result;
result = cudaGetDeviceCount(&num_devices);
if (result != cudaSuccess) {
throw std::runtime_error("cudaGetDeviceCount() failed");
}
// Gets the devices specified by the user.
// This preserves the user-specified order and checks for duplicates.
{
std::vector<int> temp_device_list;
cmdline.get_cmd_line_arguments("devices", temp_device_list);
if (temp_device_list.empty()) {
temp_device_list.push_back(0);
}
{
std::set<int> temp_device_set;
for (int device : temp_device_list) {
auto res = temp_device_set.insert(device);
if (!res.second) {
throw std::runtime_error("Duplicate device specified: " +
std::to_string(device));
} else if (device >= num_devices) {
throw std::runtime_error("Bad device ID: " +
std::to_string(device));
} else {
devices.push_back(device);
}
}
}
}
properties.resize(devices.size());
// Retrieves properties for all specified devices
for (size_t device_index = 0; device_index < devices.size(); device_index++) {
int device = devices[device_index];
result = cudaGetDeviceProperties(&properties[device_index], device);
if (result != cudaSuccess) {
throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
}
// Check that all devices are the same
if (device_index > 0) {
if ((properties[device_index].major != properties[0].major) ||
(properties[device_index].minor != properties[0].minor)) {
throw std::runtime_error("All selected devices must have the same "
"compute capability");
}
if (properties[device_index].l2CacheSize != properties[0].l2CacheSize) {
throw std::runtime_error("All selected devices must have the same "
"L2 cache size");
}
if (properties[device_index].multiProcessorCount != properties[0].multiProcessorCount) {
throw std::runtime_error("All selected devices must have the same "
"SM count");
}
}
result = cudaSetDevice(device);
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed for given device.");
}
// Permit overriding the compute capability
if (cmdline.check_cmd_line_flag("compute-capability")) {
int cc = compute_capability(device_index);
cmdline.get_cmd_line_argument("compute-capability", cc, cc);
properties[device_index].major = cc / 10;
properties[device_index].minor = cc % 10;
}
// Permit overriding the L2 cache capacity
if (cmdline.check_cmd_line_flag("llc-capacity")) {
int llc_capacity = 0;
cmdline.get_cmd_line_argument("llc-capacity", llc_capacity, 0);
if (llc_capacity >= 0) {
properties[device_index].l2CacheSize = (llc_capacity << 10);
}
}
}
}
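A usage sketch of the new flag (device IDs illustrative); per the checks above, all selected devices must match in compute capability, L2 capacity, and SM count:

$ cutlass_profiler --operation=gemm --m=4096 --n=4096 --k=4096 --devices=0,2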
void Options::Device::print_usage(std::ostream &out) const {
out << "Device:\n"
<< " --devices=<int>,<int>,... "
<< " CUDA Device IDs\n\n";
int device_count = 0;
cudaError_t result = cudaGetDeviceCount(&device_count);
@ -111,11 +164,11 @@ void Options::Device::print_usage(std::ostream &out) const {
break;
}
else {
out << " [" << idx << "] - "
<< prop.name << " - SM " << prop.major << "." << prop.minor << ", "
<< prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, "
<< "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB"
<< std::endl;
}
}
out << "\n";
@ -133,15 +186,8 @@ void Options::Device::print_usage(std::ostream &out) const {
}
void Options::Device::print_device_info(std::ostream &out) const {
out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;
@ -165,14 +211,28 @@ void Options::Device::print_device_info(std::ostream &out) const {
void Options::Device::print_options(std::ostream &out, int indent) const {
out
<< indent_str(indent) << "devices: ";
for (int device : devices) {
out << device << ',';
}
out
<< "\n"
<< indent_str(indent) << "clock: " << int(double(properties[0].clockRate) / 1000.0) << "\n"
<< indent_str(indent) << "compute-capability: " << compute_capability(0) << "\n";
}
/// Returns the device ID from a device index
int Options::Device::device_id(size_t device_index) const {
if (device_index >= devices.size()) {
throw std::runtime_error("Out of bounds device index: " +
std::to_string(device_index));
}
return devices.at(device_index);
}
/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
int Options::Device::compute_capability(int device_index) const {
return properties[device_index].major * 10 + properties[device_index].minor;
}
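For example, an SM 9.0 (Hopper) device reports major == 9 and minor == 0, so compute_capability(device_index) returns 90.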
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -207,10 +267,10 @@ Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) {
else {
// profiler chosen data distribution (allowed to change based on numeric types)
fix_data_distribution = false;
// set uniform data distribution with range [-4, 4]
data_distribution.set_uniform(-4, 4, 0);
}
}
@ -248,10 +308,10 @@ void Options::Initialization::get_distribution(
};
// Initialize pnz values to a default of 1.0 (100% nonzero)
dist.gaussian.pnz = 1.0;
dist.gaussian.pnzA = 1.0;
dist.gaussian.pnzB = 1.0;
dist.gaussian.pnzC = 1.0;
using KeyValueVector = std::vector<std::pair<std::string, std::string> >;
@ -335,7 +395,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
std::string mode = "default";
cmdline.get_cmd_line_argument("library-algo-mode", mode);
algorithm_mode = from_string<AlgorithmMode>(mode);
}
if (cmdline.check_cmd_line_flag("library-algos")) {
@ -353,7 +413,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
}
else {
int algo;
std::stringstream ss;
ss << token;
ss >> algo;
@ -396,12 +456,12 @@ void Options::Library::print_options(std::ostream &out, int indent) const {
Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0);
cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10);
cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);
if (cmdline.check_cmd_line_flag("providers")) {
std::vector<std::string> tokens;
@ -416,7 +476,7 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
else {
providers.push_back(library::Provider::kCUTLASS);
providers.push_back(library::Provider::kCUBLAS);
providers.push_back(library::Provider::kCUDNN);
}
}
@ -480,7 +540,7 @@ size_t Options::Profiling::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////
Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("verification-enabled", enabled, true);
if (enabled) {
cmdline.get_cmd_line_argument("verification-required", required, false);
@ -500,7 +560,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
}
if (cmdline.check_cmd_line_flag("verification-providers")) {
std::vector<std::string> tokens;
cmdline.get_cmd_line_arguments("verification-providers", tokens);
@ -516,7 +576,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
else {
providers.push_back(library::Provider::kCUBLAS);
providers.push_back(library::Provider::kReferenceDevice);
providers.push_back(library::Provider::kCUDNN);
}
}
@ -583,11 +643,11 @@ size_t Options::Verification::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////
Options::Report::Report(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("append", append, false);
cmdline.get_cmd_line_argument("output", output_path);
cmdline.get_cmd_line_argument("junit-output", junit_output_path);
if (cmdline.check_cmd_line_flag("tags")) {
cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
}
@ -687,11 +747,11 @@ Options::Options(cutlass::CommandLine const &cmdline):
device(cmdline),
initialization(cmdline),
library(cmdline),
profiling(cmdline),
verification(cmdline),
report(cmdline),
about(cmdline) {
if (cmdline.check_cmd_line_flag("mode")) {
std::string token;
cmdline.get_cmd_line_argument("mode", token);

View File

@ -94,7 +94,7 @@ PerformanceReport::PerformanceReport(
if (options_.report.append) {
std::ifstream test_output_file(op_file_name_);
if (test_output_file.is_open()) {
print_header = false;
test_output_file.close();
@ -145,7 +145,7 @@ void PerformanceReport::append_result(PerformanceResult result) {
if (options_.report.verbose) {
std::cout << "\n";
print_result_pretty_(std::cout, result) << std::flush;
}
if (junit_output_file_.is_open()) {
@ -237,7 +237,7 @@ static const char *disposition_status_color(Disposition disposition) {
/// Prints the result in human readable form
std::ostream & PerformanceReport::print_result_pretty_(
std::ostream &out,
PerformanceResult const &result,
bool use_shell_coloring) {
@ -251,14 +251,14 @@ std::ostream & PerformanceReport::print_result_pretty_(
int column_idx = 0;
for (auto const & tag : options_.report.pivot_tags) {
out << (column_idx++ ? "," : "") << tag.first << ":" << tag.second;
}
out << "\n";
}
std::string shell_color_bright = use_shell_coloring ? SHELL_COLOR_BRIGHT() : "";
std::string shell_color_end = use_shell_coloring ? SHELL_COLOR_END() : "";
auto _disposition_status_color = [&](Disposition d) -> const char * {
return use_shell_coloring ? disposition_status_color(d) : "";
};
@ -277,7 +277,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
static int const indent_spaces = 16;
for(auto & m : result.verification_map) {
out << std::right << std::setw(indent_spaces) << library::to_string(m.first, true) << ": " << to_string(m.second, true) << "\n";
}
}
@ -287,7 +287,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
int column_idx = 0;
for (auto const &arg : result.arguments) {
if (!arg.second.empty()) {
out << " --" << arg.first << "=" << arg.second;
column_idx += int(4 + arg.first.size() + arg.second.size());
if (column_idx > 98) {
out << " \\\n ";
@ -297,7 +297,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
}
out << "\n\n";
out
<< " Bytes: " << result.bytes << " bytes\n"
<< " FLOPs: " << result.flops << " flops\n"
<< " FLOPs/Byte: " << (result.flops / result.bytes) << "\n\n";
@ -325,7 +325,7 @@ std::ostream & PerformanceReport::print_csv_header_(
out << (column_idx++ ? "," : "") << tag.first;
}
out
<< (column_idx ? "," : "") << "Problem,Provider"
<< ",OperationKind,Operation,Disposition,Status";
@ -333,7 +333,7 @@ std::ostream & PerformanceReport::print_csv_header_(
out << "," << arg_name;
}
out
<< ",Bytes"
<< ",Flops"
<< ",Flops/Byte"
@ -347,7 +347,7 @@ std::ostream & PerformanceReport::print_csv_header_(
/// Print the result in CSV output
std::ostream & PerformanceReport::print_result_csv_(
std::ostream &out,
PerformanceResult const &result) {
int column_idx = 0;
@ -357,8 +357,8 @@ std::ostream & PerformanceReport::print_result_csv_(
out << (column_idx++ ? "," : "") << tag.second;
}
out
<< (column_idx ? "," : "")
<< result.problem_index
<< "," << to_string(result.provider, true)
<< "," << to_string(result.op_kind)
@ -370,7 +370,7 @@ std::ostream & PerformanceReport::print_result_csv_(
out << "," << arg.second;
}
out
<< "," << result.bytes
<< "," << result.flops
<< "," << result.flops / result.bytes
@ -387,7 +387,7 @@ std::ostream & PerformanceReport::print_result_csv_(
else {
out << std::string(2, ',');
}
return out;
@ -451,25 +451,25 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform
case Disposition::kNotSupported:
skipped = true;
break;
case Disposition::kPassed:
case Disposition::kNotVerified:
break;
case Disposition::kFailed:
case Disposition::kIncorrect:
failed = true;
break;
case Disposition::kInvalidProblem:
case Disposition::kInvalid:
error = true;
break;
};
if (skipped) {
out << "status=\"notrun\"";
} else {
out << "status=\"run\"";
}
out << ">" << std::endl;
if (failed) {
@ -488,7 +488,7 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform
out << " </testcase>" << std::endl;
return out;
}

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kRank2K,
@ -95,7 +95,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size Syrk kernel:\n"
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=symmetric --n=1024 --k=128\n\n"
<< "Profile a particular problem size Herk kernel:\n"
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=hermitian --n=1024 --k=128\n\n"
@ -118,7 +118,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=rank_2k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to rank_2k kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=rank_2k \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -148,22 +148,22 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -187,29 +187,29 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->n), int(this->k)}).front();
@ -311,14 +311,14 @@ void Rank2KOperationProfiler::RankKProblem::initialize_result(
/// Extracts the problem dimensions
Status Rank2KOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -326,7 +326,7 @@ Status Rank2KOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
@ -350,14 +350,14 @@ Status Rank2KOperationProfiler::initialize_configuration(
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}
/// Initializes the performance result
void Rank2KOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -365,7 +365,7 @@ void Rank2KOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -380,19 +380,30 @@ void Rank2KOperationProfiler::initialize_result_(
/// Initializes workspace
Status Rank2KOperationProfiler::initialize_workspace(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -400,10 +411,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.lda)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -411,10 +423,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -422,23 +435,30 @@ Status Rank2KOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++,
0 // device_index
);
rank_k_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
@ -487,7 +507,7 @@ Status Rank2KOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_cutlass(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -516,7 +536,7 @@ bool Rank2KOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&rank_k_workspace_.arguments,
rank_k_workspace_.host_workspace.data(),
rank_k_workspace_.device_workspace.data());
@ -564,8 +584,8 @@ bool Rank2KOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -591,7 +611,7 @@ bool Rank2KOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_with_cublas_(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -601,13 +621,13 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::RankKDescription const &rank_k_desc =
static_cast<library::RankKDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -636,8 +656,8 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
rank_k_workspace_.arguments.beta = problem_.beta.data();
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
rank_k_workspace_.configuration,
rank_k_workspace_.arguments
);
@ -669,7 +689,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -694,7 +714,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool Rank2KOperationProfiler::profile(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
RankKOperationProfiler::RankKOperationProfiler(Options const &options):
RankKOperationProfiler::RankKOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kRankK,
@ -94,7 +94,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size Syrk kernel:\n"
<< " $ cutlass_profiler --operation=rank_k --blas_mode=symmetric --n=1024 --k=128\n\n"
<< "Profile a particular problem size Herk kernel:\n"
<< " $ cutlass_profiler --operation=rank_k --blas_mode=hermitian --n=1024 --k=128\n\n"
@ -117,7 +117,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=rank_k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to rank_k kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=rank_k \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -147,22 +147,22 @@ Status RankKOperationProfiler::RankKProblem::parse(
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -182,29 +182,29 @@ Status RankKOperationProfiler::RankKProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->n), int(this->k)}).front();
@ -252,7 +252,7 @@ int64_t RankKOperationProfiler::RankKProblem::flops(library::RankKDescription co
case library::MathOperationID::kMultiplyAddComplexFastF32:
flops_ *= 4;
break;
case library::MathOperationID::kMultiplyAddGaussianComplex:
flops_ *= 3;
break;
@ -300,14 +300,14 @@ void RankKOperationProfiler::RankKProblem::initialize_result(
/// Extracts the problem dimensions
Status RankKOperationProfiler::initialize_configuration(
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -315,7 +315,7 @@ Status RankKOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
@ -337,14 +337,14 @@ Status RankKOperationProfiler::initialize_configuration(
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}
/// Initializes the performance result
void RankKOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::RankKDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -352,7 +352,7 @@ void RankKOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -368,7 +368,7 @@ void RankKOperationProfiler::initialize_result_(
case library::MathOperationID::kMultiplyAddComplex:
result.flops *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
result.flops *= 4;
break;
@ -380,19 +380,30 @@ void RankKOperationProfiler::initialize_result_(
/// Initializes workspace
Status RankKOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::RankKDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::RankKDescription const &operation_desc =
static_cast<library::RankKDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
rank_k_workspace_.A = device_context.allocate_tensor(
rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -400,10 +411,11 @@ Status RankKOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.k)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
rank_k_workspace_.C = device_context.allocate_tensor(
rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -411,23 +423,30 @@ Status RankKOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
rank_k_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.n), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
@ -476,7 +495,7 @@ Status RankKOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -504,7 +523,7 @@ bool RankKOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&rank_k_workspace_.arguments,
&rank_k_workspace_.arguments,
rank_k_workspace_.host_workspace.data(),
rank_k_workspace_.device_workspace.data());
@ -552,8 +571,8 @@ bool RankKOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -579,7 +598,7 @@ bool RankKOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -589,13 +608,13 @@ bool RankKOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::RankKDescription const &rank_k_desc =
library::RankKDescription const &rank_k_desc =
static_cast<library::RankKDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -623,8 +642,8 @@ bool RankKOperationProfiler::verify_with_cublas_(
rank_k_workspace_.arguments.beta = problem_.beta.data();
rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
detail::cublasRankKDispatcher rank_k_op(
rank_k_desc,
rank_k_workspace_.configuration,
rank_k_workspace_.arguments
);
@ -656,7 +675,7 @@ bool RankKOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -681,7 +700,7 @@ bool RankKOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool RankKOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -51,23 +51,23 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
SparseGemmOperationProfiler::SparseGemmOperationProfiler(Options const &options):
SparseGemmOperationProfiler::SparseGemmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kSparseGemm,
{
{ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (e.g. sparse, ...)"},
{ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"},
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"E"}, "Tensor storing the E operand"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"},
{ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"},
{ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (e.g. sparse, ...)"},
{ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"},
{ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"},
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"E"}, "Tensor storing the E operand"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"},
{ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"},
}
) {
@ -109,7 +109,7 @@ void SparseGemmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=SparseGemm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to gemm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=SparseGemm \\ \n"
<< " --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -125,7 +125,7 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
library::SparseGemmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
@ -135,17 +135,17 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
// default value
this->n = 1024;
}
if (!arg_as_int(this->k, "k", problem_space, problem)) {
// default value
this->k = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -168,24 +168,24 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
@ -252,14 +252,14 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result(
/// Extracts the problem dimensions
Status SparseGemmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SparseGemmDescription const &operation_desc =
library::SparseGemmDescription const &operation_desc =
static_cast<library::SparseGemmDescription const &>(operation->description());
if (operation_desc.gemm_kind != library::GemmKind::kSparse) {
@ -291,14 +291,14 @@ Status SparseGemmOperationProfiler::initialize_configuration(
gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&gemm_workspace_.configuration, &gemm_workspace_.arguments);
}
/// Initializes the performance result
void SparseGemmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::SparseGemmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -308,7 +308,7 @@ void SparseGemmOperationProfiler::initialize_result_(
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
// Input bytes read and Output bytes written for the gemm problem
@ -337,19 +337,30 @@ void SparseGemmOperationProfiler::initialize_result_(
/// Initializes workspace
Status SparseGemmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SparseGemmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::SparseGemmDescription const &operation_desc =
static_cast<library::SparseGemmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
gemm_workspace_.A = device_context.allocate_tensor(
gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -357,10 +368,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k) / int(problem_.sparse)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.B = device_context.allocate_tensor(
gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -368,10 +380,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.k), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.C = device_context.allocate_tensor(
gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -379,18 +392,22 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
gemm_workspace_.E = device_context.allocate_sparsemeta_tensor(
gemm_workspace_.E = device_context.allocate_and_initialize_sparsemeta_tensor(
options,
"E",
operation_desc.E.element,
@ -399,15 +416,19 @@ Status SparseGemmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.k) / int(problem_.sparse) / int(problem_.elements_per_128b)},
{int(problem_.lde)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
gemm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
@ -456,7 +477,7 @@ Status SparseGemmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool SparseGemmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -486,7 +507,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&gemm_workspace_.arguments,
&gemm_workspace_.arguments,
gemm_workspace_.host_workspace.data(),
gemm_workspace_.device_workspace.data());
@ -510,7 +531,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
if (options.verification.enabled) {
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
@ -537,7 +558,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
/// Measures performance results
bool SparseGemmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -565,7 +586,7 @@ bool SparseGemmOperationProfiler::profile(
gemm_workspace_.device_workspace.data()
);
}
return true;
}

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
SymmOperationProfiler::SymmOperationProfiler(Options const &options):
SymmOperationProfiler::SymmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kSymm,
@ -96,7 +96,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {
out << "\nExamples:\n\n"
<< "Profile a particular problem size SYMM kernel:\n"
<< " $ cutlass_profiler --operation=Symm --blas_mode=symmetric --m=1024 --n=128\n\n"
<< "Profile a particular problem size HEMM kernel:\n"
<< " $ cutlass_profiler --operation=Symm --blas_mode=hermitian --m=1024 --n=128\n\n"
@ -122,7 +122,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=Symm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to symm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=Symm \\ \n"
<< " --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -152,22 +152,22 @@ Status SymmOperationProfiler::SymmProblem::parse(
library::SymmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
}
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -191,29 +191,29 @@ Status SymmOperationProfiler::SymmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
if (operation_desc.side_mode == SideMode::kLeft) {
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -240,12 +240,12 @@ int64_t SymmOperationProfiler::SymmProblem::bytes(library::SymmDescription const
if (operation_desc.side_mode == SideMode::kLeft) {
bytes =
int64_t(library::sizeof_bits(operation_desc.A.element) * m / 8) * (m + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
} else if (operation_desc.side_mode == SideMode::kRight) {
bytes =
int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * (n + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
}
// Set is_beta_zero true if beta is zero
@ -277,7 +277,7 @@ int64_t SymmOperationProfiler::SymmProblem::flops(library::SymmDescription const
case library::MathOperationID::kMultiplyAddComplex:
flops_ *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
flops_ *= 4;
break;
@ -334,14 +334,14 @@ void SymmOperationProfiler::SymmProblem::initialize_result(
/// Extracts the problem dimensions
Status SymmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SymmDescription const &operation_desc =
library::SymmDescription const &operation_desc =
static_cast<library::SymmDescription const &>(operation->description());
if (operation_desc.symm_kind != library::SymmKind::kUniversal) {
@ -349,14 +349,14 @@ Status SymmOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
symm_workspace_.configuration.problem_size.m() = int(problem_.m);
symm_workspace_.configuration.problem_size.n() = int(problem_.n);
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
? int(problem_.m) : int(problem_.n);
symm_workspace_.configuration.lda = problem_.lda;
symm_workspace_.configuration.ldb = problem_.ldb;
@ -374,14 +374,14 @@ Status SymmOperationProfiler::initialize_configuration(
symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&symm_workspace_.configuration, &symm_workspace_.arguments);
}
/// Initializes the performance result
void SymmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::SymmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -389,7 +389,7 @@ void SymmOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -404,20 +404,31 @@ void SymmOperationProfiler::initialize_result_(
/// Initializes workspace
Status SymmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::SymmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::SymmDescription const &operation_desc =
static_cast<library::SymmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
if (operation_desc.side_mode == SideMode::kLeft) {
symm_workspace_.A = device_context.allocate_tensor(
symm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -425,10 +436,11 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.m)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
} else if (operation_desc.side_mode == SideMode::kRight) {
symm_workspace_.A = device_context.allocate_tensor(
symm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -436,11 +448,12 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
}
symm_workspace_.B = device_context.allocate_tensor(
symm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -448,10 +461,11 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
symm_workspace_.C = device_context.allocate_tensor(
symm_workspace_.C = device_context.allocate_and_initialize_tensor(
options,
"C",
operation_desc.C.element,
@ -459,23 +473,30 @@ Status SymmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
symm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
symm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.C.element,
operation_desc.C.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldc)}
{int(problem_.ldc)},
1, // batch_count
0 // device_index
);
symm_workspace_.Computed->copy_from_device(symm_workspace_.C->data());
@ -524,7 +545,7 @@ Status SymmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -553,7 +574,7 @@ bool SymmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&symm_workspace_.arguments,
&symm_workspace_.arguments,
symm_workspace_.host_workspace.data(),
symm_workspace_.device_workspace.data());
@ -601,8 +622,8 @@ bool SymmOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -628,7 +649,7 @@ bool SymmOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -638,13 +659,13 @@ bool SymmOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::SymmDescription const &symm_desc =
library::SymmDescription const &symm_desc =
static_cast<library::SymmDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -673,8 +694,8 @@ bool SymmOperationProfiler::verify_with_cublas_(
symm_workspace_.arguments.beta = problem_.beta.data();
symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasSymmDispatcher symm_op(
symm_desc,
detail::cublasSymmDispatcher symm_op(
symm_desc,
symm_workspace_.configuration,
symm_workspace_.arguments
);
@ -706,7 +727,7 @@ bool SymmOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -731,7 +752,7 @@ bool SymmOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool SymmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -31,7 +31,7 @@
/* \file
\brief Execution environment
*/
#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Ctor
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
OperationProfiler(
options,
library::OperationKind::kTrmm,
@ -113,7 +113,7 @@ void TrmmOperationProfiler::print_examples(std::ostream &out) const {
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
<< " $ cutlass_profiler --operation=Trmm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
<< "Test your changes to trmm kernels with a quick functional test and save results in functional-test.csv:\n"
<< " $ cutlass_profiler --operation=Trmm \\ \n"
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -143,22 +143,22 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
library::TrmmDescription const &operation_desc,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
if (!arg_as_int(this->m, "m", problem_space, problem)) {
// default value
this->m = 1024;
}
if (!arg_as_int(this->n, "n", problem_space, problem)) {
// default value
this->n = 1024;
}
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
// default value
this->split_k_slices = 1;
}
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
// default value
this->batch_count = 1;
@ -182,29 +182,29 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
this->alpha,
operation_desc.element_epilogue,
"alpha",
problem_space,
problem)) {
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
return Status::kErrorInternal;
}
}
if (!arg_as_scalar(
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
this->beta,
operation_desc.element_epilogue,
"beta",
problem_space,
problem)) {
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
return Status::kErrorInternal;
}
}
if (operation_desc.side_mode == SideMode::kLeft) {
this->lda = DeviceAllocation::get_packed_layout(
operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -265,14 +265,14 @@ void TrmmOperationProfiler::TrmmProblem::initialize_result(
/// Extracts the problem dimensions
Status TrmmOperationProfiler::initialize_configuration(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::TrmmDescription const &operation_desc =
library::TrmmDescription const &operation_desc =
static_cast<library::TrmmDescription const &>(operation->description());
if (operation_desc.trmm_kind != library::TrmmKind::kUniversal) {
@ -280,14 +280,14 @@ Status TrmmOperationProfiler::initialize_configuration(
}
Status status = problem_.parse(operation_desc, problem_space, problem);
if (status != Status::kSuccess) {
return status;
}
trmm_workspace_.configuration.problem_size.m() = int(problem_.m);
trmm_workspace_.configuration.problem_size.n() = int(problem_.n);
trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
? int(problem_.m) : int(problem_.n);
trmm_workspace_.configuration.lda = problem_.lda;
trmm_workspace_.configuration.ldb = problem_.ldb;
@ -303,14 +303,14 @@ Status TrmmOperationProfiler::initialize_configuration(
trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
initialize_result_(this->model_result_, options, operation_desc, problem_space);
return operation->can_implement(&trmm_workspace_.configuration, &trmm_workspace_.arguments);
}
/// Initializes the performance result
void TrmmOperationProfiler::initialize_result_(
PerformanceResult &result,
Options const &options,
Options const &options,
library::TrmmDescription const &operation_desc,
ProblemSpace const &problem_space) {
@ -318,30 +318,30 @@ void TrmmOperationProfiler::initialize_result_(
result.disposition = Disposition::kNotRun;
result.status = Status::kSuccess;
result.operation_name = operation_desc.name;
problem_.initialize_result(result, operation_desc, problem_space);
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
if (operation_desc.side_mode == SideMode::kLeft) {
// Input bytes read and Output bytes written for the trmm problem
result.bytes =
result.bytes =
// Half matrix including the diagonal will have (M*(M+1))/2 elements
int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * (problem_.m + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
} else if (operation_desc.side_mode == SideMode::kRight) {
// Input bytes read and Output bytes written for the trmm problem
result.bytes =
result.bytes =
// Half matrix including the diagonal will have (N*(N+1))/2 elements
int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.n / 8) * (problem_.n + 1) / 2 +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
}
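// Sanity check on the byte counts above (worked example, not from the source):
// for a left-side TRMM with half-precision A, B, D (2 bytes per element),
// m = 1024, n = 512:
//   A reads  2 * 1024 * 1025 / 2 = 1,049,600 bytes (triangle incl. diagonal)
//   B reads  2 * 1024 * 512      = 1,048,576 bytes
//   D writes 2 * 1024 * 512      = 1,048,576 bytes   -> ~3.1 MB total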
// FLOPs = 2 * [ ( M * (M+1)/2 * N ) ] // Beta is zero
result.flops = problem_.m * (problem_.m + 1) * problem_.n;
result.runtime = 0;
// complex-valued support
@ -349,11 +349,11 @@ void TrmmOperationProfiler::initialize_result_(
case library::MathOperationID::kMultiplyAddComplex:
result.flops *= 4;
break;
case library::MathOperationID::kMultiplyAddComplexFastF32:
result.flops *= 4;
break;
default: break;
}
@ -361,20 +361,31 @@ void TrmmOperationProfiler::initialize_result_(
/// Initializes workspace
Status TrmmOperationProfiler::initialize_workspace(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem) {
library::TrmmDescription const &operation_desc =
if (options.device.devices.size() != 1) {
throw std::runtime_error("This operation profiler only supports a single "
"device.");
}
cudaError_t result;
result = cudaSetDevice(options.device.device_id(0));
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed.");
}
library::TrmmDescription const &operation_desc =
static_cast<library::TrmmDescription const &>(operation->description());
if (options.execution_mode != ExecutionMode::kDryRun) {
int seed_shift = 0;
if (operation_desc.side_mode == SideMode::kLeft) {
trmm_workspace_.A = device_context.allocate_tensor(
trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -382,10 +393,11 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.m)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
} else if (operation_desc.side_mode == SideMode::kRight) {
trmm_workspace_.A = device_context.allocate_tensor(
trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
options,
"A",
operation_desc.A.element,
@ -393,11 +405,12 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.n), int(problem_.n)},
{int(problem_.lda)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
}
trmm_workspace_.B = device_context.allocate_tensor(
trmm_workspace_.B = device_context.allocate_and_initialize_tensor(
options,
"B",
operation_desc.B.element,
@ -405,23 +418,30 @@ Status TrmmOperationProfiler::initialize_workspace(
{int(problem_.m), int(problem_.n)},
{int(problem_.ldb)},
1, // batch_count
seed_shift++
seed_shift++,
0 // device_index
);
trmm_workspace_.Computed = device_context.allocate_tensor(
options,
"D",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldd)}
{int(problem_.ldd)},
1, // batch_count
0 // device_index
);
trmm_workspace_.Reference = device_context.allocate_tensor(
options,
"Reference",
operation_desc.D.element,
operation_desc.D.layout,
{int(problem_.m), int(problem_.n)},
{int(problem_.ldd)}
{int(problem_.ldd)},
1, // batch_count
0 // device_index
);
}
@ -467,7 +487,7 @@ Status TrmmOperationProfiler::initialize_workspace(
/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_cutlass(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -495,7 +515,7 @@ bool TrmmOperationProfiler::verify_cutlass(
//
results_.back().status = operation->run(
&trmm_workspace_.arguments,
&trmm_workspace_.arguments,
trmm_workspace_.host_workspace.data(),
trmm_workspace_.device_workspace.data());
@ -543,8 +563,8 @@ bool TrmmOperationProfiler::verify_cutlass(
}
}
#endif // #if CUTLASS_ENABLE_CUBLAS
// Update disposition to worst case verification outcome among all
// Update disposition to worst case verification outcome among all
// verification providers which are supported
bool is_any_verification_run_passed = false;
for(auto &m : results_.back().verification_map) {
@ -570,7 +590,7 @@ bool TrmmOperationProfiler::verify_cutlass(
/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_with_cublas_(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,
@ -580,13 +600,13 @@ bool TrmmOperationProfiler::verify_with_cublas_(
#if CUTLASS_ENABLE_CUBLAS
library::TrmmDescription const &trmm_desc =
library::TrmmDescription const &trmm_desc =
static_cast<library::TrmmDescription const &>(operation->description());
//
// Construct cuBLAS operators
//
CublasCreate handle;
cublasStatus_t status = handle.get_cublas_create_status();
@ -614,8 +634,8 @@ bool TrmmOperationProfiler::verify_with_cublas_(
trmm_workspace_.arguments.beta = problem_.beta.data();
trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
detail::cublasTrmmDispatcher trmm_op(
trmm_desc,
detail::cublasTrmmDispatcher trmm_op(
trmm_desc,
trmm_workspace_.configuration,
trmm_workspace_.arguments
);
@ -646,7 +666,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(
);
// Save workspace if incorrect
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {
save_workspace(
@ -671,7 +691,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(
/// Measures performance results
bool TrmmOperationProfiler::profile(
Options const &options,
Options const &options,
PerformanceReport &report,
DeviceContext &device_context,
library::Operation const *operation,

View File

@ -37,9 +37,11 @@
*/
#include <memory>
#include <sstream>
#include "cutlass/platform/platform.h"
#include "cutlass/numeric_types.h"
#include "cutlass/trace.h"
#include "exceptions.h"
namespace cutlass {
@ -61,8 +63,20 @@ T* allocate(size_t count = 1) {
cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);
if (cuda_error != cudaSuccess) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
std::ostringstream os;
os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
#endif
throw cuda_exception("Failed to allocate memory", cuda_error);
}
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
else {
std::ostringstream os;
os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
CUTLASS_TRACE_HOST(os.str());
}
#endif
return ptr;
}
@ -85,11 +99,36 @@ void free(T* ptr) {
template <typename T>
void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
size_t bytes = count * sizeof_bits<T>::value / 8;
if (bytes == 0 && count > 0)
if (bytes == 0 && count > 0) {
bytes = 1;
}
cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
if (cuda_error != cudaSuccess) {
throw cuda_exception("cudaMemcpy() failed", cuda_error);
std::ostringstream os;
os << "cutlass::device_memory::copy: cudaMemcpy() failed: "
<< "dst=" << dst << ", src=" << src
<< ", bytes=" << bytes << ", count=" << count;
if (kind == cudaMemcpyHostToDevice) {
os << ", kind=cudaMemcpyHostToDevice";
}
else if (kind == cudaMemcpyDeviceToHost) {
os << ", kind=cudaMemcpyDeviceToHost";
}
else if (kind == cudaMemcpyDeviceToDevice) {
os << ", kind=cudaMemcpyDeviceToDevice";
}
else if (kind == cudaMemcpyHostToHost) {
os << ", kind=cudaMemcpyHostToHost";
}
else if (kind == cudaMemcpyDefault) {
os << ", kind=cudaMemcpyDefault";
}
else {
os << ", kind=Unknown";
}
os << ", error: " << cudaGetErrorString(cuda_error);
throw cuda_exception(os.str().c_str(), cuda_error);
}
}
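// Usage sketch for the byte computation above (illustration only): for a
// sub-byte element type such as cutlass::int4b_t, sizeof_bits<T>::value = 4,
// so copying count = 10 elements moves 10 * 4 / 8 = 5 bytes, while a nonzero
// count whose byte size rounds down to 0 (e.g. count = 1) is bumped to 1 byte:
//
//   cutlass::device_memory::copy(dst, src, 10, cudaMemcpyDeviceToDevice); // 5 bytes
//   cutlass::device_memory::copy(dst, src, 1,  cudaMemcpyDeviceToDevice); // 1 byte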

View File

@ -51,6 +51,8 @@ struct Distribution {
struct {
double min;
double max;
// Fraction of elements set to NaN (in [0, 1])
double pnan;
} uniform;
/// Gaussian distribution
@ -82,17 +84,18 @@ struct Distribution {
Distribution() : kind(Invalid), int_scale(0) {}
/// Configures distribution as uniform random
Distribution &set_uniform(double _min, double _max, int _int_scale = 0) {
/// Configures distribution as uniform random
Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) {
kind = Uniform;
uniform.min = _min;
uniform.max = _max;
int_scale = _int_scale;
uniform.pnan = _pnan;
return *this;
}
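// Usage sketch for the new pnan knob (values assumed for illustration):
//
//   cutlass::Distribution dist;
//   dist.set_uniform(/*min=*/-4.0, /*max=*/4.0, /*int_scale=*/-1, /*pnan=*/0.05);
//   // ~5% of elements drawn through this distribution become NaN when it is
//   // consumed by TensorFillRandom / BlockFillRandom.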
/// Configures distribution as Gaussian distribution
Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 100.0) {
Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) {
kind = Gaussian;
gaussian.mean = _mean;
gaussian.stddev = _stddev;
@ -125,7 +128,8 @@ struct Distribution {
inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) {
switch (dist.kind) {
case cutlass::Distribution::Uniform:
out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max;
out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max
<< ", pnan: " << dist.uniform.pnan;
break;
case cutlass::Distribution::Gaussian:
out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev

View File

@ -177,16 +177,25 @@ public:
void reserve(
size_t count, ///< size of tensor in elements
bool device_backed_ = true) { ///< if true, device memory is also allocated
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")");
#endif
device_.reset();
host_.clear();
size_t count_container = count_to_container_storage_unit_count(count);
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")");
#endif
host_.resize(count_container);
// Allocate memory
StorageUnit* device_memory = nullptr;
if (device_backed_) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")");
#endif
device_memory = device_memory::allocate<StorageUnit>(count_container);
}
device_.reset(device_memory, device_backed_ ? count_container : 0);
@ -394,7 +403,7 @@ public:
void sync_device() {
if (device_backed()) {
device_memory::copy_to_device(
device_.get(), host_.data(), host_.capacity());
device_.get(), host_.data(), host_.size());
}
}
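// Usage sketch (hypothetical sizes): sync_device() now copies host_.size()
// storage units rather than the allocation's capacity:
//
//   cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 64});
//   tensor.host_view().at({0, 0}) = 1.0f;
//   tensor.sync_device();   // copies exactly 128 * 64 floats to the device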

View File

@ -35,6 +35,8 @@
#pragma once
#include "cute/layout.hpp"
#include "cute/container/array.hpp" // cute::array
#include "cutlass/conv/convolution.h" // cutlass::conv::Operator
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -57,6 +57,7 @@
#include "cutlass/complex.h"
#include "cutlass/tensor_view.h"
#include "cutlass/blas3.h"
#include "cutlass/numeric_types.h"
#include "cutlass/layout/vector.h"
@ -117,6 +118,7 @@ struct RandomGaussianFunc {
int int_scale;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
//
// Methods
@ -127,12 +129,14 @@ struct RandomGaussianFunc {
uint64_t seed_ = 0,
Element mean_ = 0,
Element stddev_ = 1,
int int_scale_ = -1
int int_scale_ = -1,
int exclude_zero_ = -1
):
seed(seed_),
mean(static_cast<FloatType>(mean_)),
stddev(static_cast<FloatType>(stddev_)),
int_scale(int_scale_) {
int_scale(int_scale_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
@ -178,6 +182,15 @@ struct RandomGaussianFunc {
result = Element(rnd);
}
if (params.exclude_zero >= 0 && result == Element(0.0)) {
if (rnd > FloatType(0)) {
rnd += FloatType(1);
} else {
rnd -= FloatType(1);
}
result = Element(rnd);
}
return result;
}
};
@ -203,6 +216,7 @@ struct RandomGaussianFunc<complex<Real>> {
int int_scale;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
//
// Methods
@ -213,12 +227,14 @@ struct RandomGaussianFunc<complex<Real>> {
uint64_t seed_ = 0,
Real mean_ = 0,
Real stddev_ = 1,
int int_scale_ = -1
int int_scale_ = -1,
int exclude_zero_ = -1
):
seed(seed_),
mean(static_cast<FloatType>(mean_)),
stddev(static_cast<FloatType>(stddev_)),
int_scale(int_scale_) {
int_scale(int_scale_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(1) << int_scale);
float_scale_up += FloatType(0.5) * float_scale_up;
@ -272,6 +288,18 @@ struct RandomGaussianFunc<complex<Real>> {
result = Element(Real(rnd_r), Real(rnd_i));
}
if (params.exclude_zero >= 0 &&
result.real() == Real(0.0) &&
result.imag() == Real(0.0)) {
if (rnd_r > FloatType(0)) {
rnd_r += FloatType(1);
} else {
rnd_r -= FloatType(1);
}
result = Element(Real(rnd_r), Real(rnd_i));
}
return result;
}
};
@ -358,6 +386,7 @@ void TensorFillRandomGaussian(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomGaussianFunc<Element>;
@ -366,7 +395,7 @@ void TensorFillRandomGaussian(
TensorForEach<Func, Layout::kRank, Params>(
view.extent(),
Params(view, typename RandomFunc::Params(seed, mean, stddev, bits)),
Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)),
/*grid_size*/0, /*block_size*/0,
stream
);
@ -399,7 +428,7 @@ void BlockFillRandomGaussian(
namespace detail {
/// Computes a random Gaussian distribution
/// Computes a random uniform distribution
template <typename Element> ///< Element type
struct RandomUniformFunc {
@ -424,8 +453,10 @@ struct RandomUniformFunc {
FloatType range;
FloatType max;
int int_scale;
double pnan;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
/// Default ctor
CUTLASS_HOST_DEVICE
@ -440,15 +471,25 @@ struct RandomUniformFunc {
uint64_t seed_ = 0,
Element max_ = 1,
Element min = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
int exclude_zero_ = -1
):
seed(seed_),
range(static_cast<FloatType>(max_) - static_cast<FloatType>(min)),
max(static_cast<FloatType>(max_)),
int_scale(int_scale_) {
int_scale(int_scale_),
pnan(pnan_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(2) << int_scale); // scale up to clamp low order bits
float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero >= 0) {
range = (min == Element(0)) ? range - FloatType(1): range;
max = (max_ == Element(0)) ? max - FloatType(1): max;
}
}
};
@ -479,6 +520,13 @@ struct RandomUniformFunc {
CUTLASS_DEVICE
Element operator()() {
// Draw random float in [0.0, 1.0] to determine if element should be NaN.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
return Element(NAN);
}
}
FloatType rnd = random_uniform_float<FloatType>(&rng_state);
rnd = params.max - params.range * rnd;
@ -494,6 +542,15 @@ struct RandomUniformFunc {
result = Element(rnd);
}
if (params.exclude_zero >= 0 && result == Element(0.0)) {
if (rnd > FloatType(0)) {
rnd = std::min(params.max, rnd + FloatType(1));
} else {
rnd = std::max((params.max - params.range), rnd - FloatType(1));
}
result = Element(rnd);
}
return result;
}
};
@ -525,8 +582,10 @@ struct RandomUniformFunc<complex<Real>> {
FloatType range;
FloatType min;
int int_scale;
double pnan;
FloatType float_scale_up;
FloatType float_scale_down;
int exclude_zero; ///< If non-negative, excludes zeros
/// Default ctor
CUTLASS_HOST_DEVICE
@ -541,16 +600,26 @@ struct RandomUniformFunc<complex<Real>> {
uint64_t seed_ = 0,
FloatType max = 1,
FloatType min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
int exclude_zero_ = -1
):
seed(seed_),
range(static_cast<FloatType>(max - min_)),
min(static_cast<FloatType>(min_)),
int_scale(int_scale_) {
int_scale(int_scale_),
pnan(pnan_),
exclude_zero(exclude_zero_) {
float_scale_up = FloatType(IntType(1) << int_scale);
float_scale_up += FloatType(0.5) * float_scale_up;
float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero >= 0) {
min = (min == FloatType(0)) ? min + FloatType(1): min;
range = (max == FloatType(0)) ? range - FloatType(1): range;
}
}
};
@ -581,6 +650,13 @@ struct RandomUniformFunc<complex<Real>> {
CUTLASS_DEVICE
Element operator()() {
// Draw random float in [0.0, 1.0] to determine if element should be NaN.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
return Element(Real(NAN), Real(NAN));
}
}
FloatType rnd_r = random_uniform_float<FloatType>(&rng_state);
FloatType rnd_i = random_uniform_float<FloatType>(&rng_state);
@ -604,11 +680,23 @@ struct RandomUniformFunc<complex<Real>> {
result = Element(Real(rnd_r), Real(rnd_i));
}
if (params.exclude_zero >= 0 &&
result.real() == Real(0.0) &&
result.imag() == Real(0.0)) {
if (rnd_r > FloatType(0)) {
rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1));
} else {
rnd_r = std::max((params.min), rnd_r - FloatType(1));
}
result = Element(Real(rnd_r), Real(rnd_i));
}
return result;
}
};
/// Computes a random Gaussian distribution
/// Computes a random uniform distribution
template <
typename Element, ///< Element type
typename Layout> ///< Layout function
@ -693,13 +781,15 @@ void TensorFillRandomUniform(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Fraction of elements set to NaN (in [0, 1]).
int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomUniformFunc<Element>;
using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
using Params = typename Func::Params;
typename RandomFunc::Params random(seed, max, min, bits);
typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero);
TensorForEach<Func, Layout::kRank, Params>(
view.extent(),
@ -722,11 +812,12 @@ void BlockFillRandomUniform(
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Fraction of elements set to NaN (in [0, 1]).
cudaStream_t stream = nullptr) {
using RandomFunc = detail::RandomUniformFunc<Element>;
typename RandomFunc::Params params(seed, max, min, bits);
typename RandomFunc::Params params(seed, max, min, bits, pnan);
BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
}
@ -1672,7 +1763,11 @@ void TensorFillRandom(
TensorView<Element, Layout> view, ///< destination tensor
uint64_t seed,
Distribution dist,
cudaStream_t stream = nullptr) {
cudaStream_t stream = nullptr,
int exclude_zero = -1 ///< If non-negative, excludes 0.
/// Note that setting this flag will produce extra ±1 values,
/// since zeros are replaced by adding or subtracting 1.
) {
using Real = typename RealType<Element>::Type;
@ -1683,6 +1778,7 @@ void TensorFillRandom(
static_cast<Real>(dist.gaussian.mean),
static_cast<Real>(dist.gaussian.stddev),
dist.int_scale,
exclude_zero,
stream);
} else if (dist.kind == Distribution::Uniform) {
TensorFillRandomUniform<Element, Layout>(
@ -1691,6 +1787,8 @@ void TensorFillRandom(
static_cast<Real>(dist.uniform.max),
static_cast<Real>(dist.uniform.min),
dist.int_scale,
dist.uniform.pnan,
exclude_zero,
stream);
}
}
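// Usage sketch combining the new knobs (names and values assumed):
//
//   cutlass::Distribution dist;
//   dist.set_uniform(-2.0, 2.0, /*int_scale=*/-1, /*pnan=*/0.01);   // ~1% NaNs
//   cutlass::reference::device::TensorFillRandom(
//       tensor.device_view(), /*seed=*/2024, dist,
//       /*stream=*/nullptr, /*exclude_zero=*/1);   // also replace exact zeros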
@ -1753,6 +1851,7 @@ void BlockFillRandom(
static_cast<Real>(dist.uniform.max),
static_cast<Real>(dist.uniform.min),
dist.int_scale,
dist.uniform.pnan,
stream);
}
}

View File

@ -128,7 +128,8 @@ template<
class EpilogueFusionParams
>
struct ConvReferenceImpl {
using ElementAcc = typename EpilogueFusionParams::ElementAcc;
// Hard-code the accumulator type to float to avoid data loss during the accumulating add.
using ElementAcc = cutlass::platform::conditional_t<cutlass::platform::is_same_v<typename EpilogueFusionParams::ElementAcc, double>, double, float>;
using ElementC = typename EpilogueFusionParams::ElementC;
using ElementOut = typename EpilogueFusionParams::ElementOut;
using ElementScalar = typename EpilogueFusionParams::ElementScalar;
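// Equivalent standalone sketch of the accumulator selection above, written
// against the standard library instead of cutlass::platform (illustration):
//
//   template <class T>
//   using ReferenceAcc = std::conditional_t<std::is_same_v<T, double>, double, float>;
//   static_assert(std::is_same_v<ReferenceAcc<double>, double>);
//   static_assert(std::is_same_v<ReferenceAcc<int8_t>, float>);   // promoted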

View File

@ -342,7 +342,8 @@ void gett_epilogue(
ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]);
// per-row alpha
if (raw_pointer_cast(epilogue_params.Valpha.data())) {
converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b));
converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l));
converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b));
}
ElementCompute output = mul(converted_alpha, converted_acc);
@ -355,7 +356,8 @@ void gett_epilogue(
ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l));
// per-row beta
if (epilogue_params.Vbeta.data()) {
converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b));
converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l));
converted_beta = mul(converted_beta, converted_scale_c);
}
output = epilogue_fma(converted_beta, converted_src, output);
}
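// Net effect per element (explanatory note, not from the source): with the
// per-row scaling vectors now indexed as (m, n, l),
//   D(m,n,l) = alpha(m,n,l) * scale_a * scale_b * acc(m,n,l)
//            + beta(m,n,l)  * scale_c * C(m,n,l)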

View File

@ -159,6 +159,7 @@ struct RandomGaussianFunc {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -168,9 +169,10 @@ struct RandomGaussianFunc {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -191,7 +193,7 @@ struct RandomGaussianFunc {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -208,6 +210,16 @@ struct RandomGaussianFunc {
result = static_cast<Element>(0);
}
// Note that exclude_zero = true overrides the bernoulli_result above by replacing the zeros it produced
if (exclude_zero && result == Element(0)) {
if (rnd > 0) {
rnd += 1;
} else {
rnd -= 1;
}
result = Element(rnd);
}
return result;
}
};
@ -222,6 +234,7 @@ struct RandomGaussianFunc<complex<Element> > {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -231,9 +244,10 @@ struct RandomGaussianFunc<complex<Element> > {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -249,7 +263,7 @@ struct RandomGaussianFunc<complex<Element> > {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -270,6 +284,19 @@ struct RandomGaussianFunc<complex<Element> > {
reals[1] = from_real<Element>(0);
}
// Note that this overrides the else branch above by replacing the zero elements it produced
if (exclude_zero &&
reals[0] == from_real<Element>(0.0) &&
reals[1] == from_real<Element>(0.0)) {
if (rnd[0] > 0.0) {
rnd[0] += 1.0;
} else {
rnd[0] -= 1.0;
}
reals[0] = from_real<Element>(rnd[0]);
}
return complex<Element>(reals[0], reals[1]);
}
};
@ -284,6 +311,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
int int_scale;
double pi;
double pnz;
bool exclude_zero;
//
// Methods
@ -293,9 +321,10 @@ struct RandomGaussianFunc<Quaternion<Element> > {
double mean_ = 0,
double stddev_ = 1,
int int_scale_ = -1,
double pnz_ = 100.0
double pnz_ = 1.0,
bool exclude_zero_ = false
):
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
@ -313,7 +342,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
// Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
std::random_device rnd_device;
std::mt19937 bernoulli_rnd(rnd_device());
std::bernoulli_distribution bernoulli_dist(pnz / 100);
std::bernoulli_distribution bernoulli_dist(pnz);
bool bernoulli_result = bernoulli_dist(bernoulli_rnd);
// Sample from the Gaussian distribution for a nonzero element
@ -343,6 +372,21 @@ struct RandomGaussianFunc<Quaternion<Element> > {
reals[3] = from_real<Element>(0);
}
// Note that exclude_zero overrides the else branch above by replacing zero elements (only the first component is nudged)
if (exclude_zero &&
reals[0] == from_real<Element>(0) &&
reals[1] == from_real<Element>(0) &&
reals[2] == from_real<Element>(0) &&
reals[3] == from_real<Element>(0)) {
if (rnd1[0] > 0.0) {
rnd1[0] += 1.0;
} else {
rnd1[0] -= 1.0;
}
reals[0] = from_real<Element>(rnd1[0]);
}
return Quaternion<Element>(reals[0], reals[1], reals[2], reals[3]);
}
};
@ -440,10 +484,11 @@ void TensorFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
/// data.
bool exclude_zero = false) { ///< Exclude zeros from tensor init.
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz, exclude_zero);
detail::TensorFillGaussianFunc<Element, Layout> func(
dst,
@ -466,8 +511,9 @@ void TensorFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0, /// are not truncated to zero. Permits reducing precision of
/// data.
bool exclude_zero = false) { ///< Exclude zeros from tensor init.
TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz, exclude_zero);
TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz, exclude_zero);
@ -485,7 +531,7 @@ void TensorFillSymmetricRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
@ -515,7 +561,7 @@ void BlockFillRandomGaussian(
double mean = 0, ///< Gaussian distribution's mean
double stddev = 1, ///< Gaussian distribution's standard deviation
int bits = -1, ///< If non-negative, specifies number of fractional bits that
double pnz = 100.0) { /// are not truncated to zero. Permits reducing precision of
double pnz = 1.0) { /// are not truncated to zero. Permits reducing precision of
/// data.
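
For reference, a hypothetical host-side call against the updated Gaussian overloads above; the HostTensor setup and parameter values are illustrative assumptions, and note that pnz now saturates at 1.0 rather than 100.0:

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void gaussian_fill_example() {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({128, 128});
  cutlass::reference::host::TensorFillRandomGaussian(
      A.host_view(),
      /*seed=*/2024,
      /*mean=*/0.0,
      /*stddev=*/1.0,
      /*bits=*/-1,
      /*pnz=*/0.5,             // ~half the elements are zeroed
      /*exclude_zero=*/false);
}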
@ -542,23 +588,47 @@ struct RandomUniformFunc {
double min;
int int_scale;
//
// Methods
//
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
bool exclude_zero;
RandomUniformFunc(
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
bool exclude_zero_ = false
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
, bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
, bernoulli_dist(pnan_)
, exclude_zero(exclude_zero_)
{
std::srand((unsigned)seed);
}
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero) {
min = (min == 0.0) ? min + 1: min;
range = (max == 0.0) ? range - 1: range;
}
}
/// Compute random value and update RNG state
Element operator()() const {
Element operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN);
}
}
double rnd = double(std::rand()) / double(RAND_MAX);
@ -575,6 +645,15 @@ struct RandomUniformFunc {
result = static_cast<Element>(Real(rnd));
}
if (exclude_zero && result == Element(0)) {
if (rnd > 0.0) {
rnd = std::min(min + range, rnd + 1.0);
} else {
rnd = std::max(min, rnd - 1.0);
}
result = static_cast<Element>(Real(rnd));
}
return result;
}
};
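
The NaN path above only fires for element types with a quiet NaN, so integer specializations compile the check away. A standalone sketch of the sampling scheme under those assumptions (hypothetical helper, not the CUTLASS functor):

#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

template <typename Element>
Element draw(std::mt19937 &rng, double pnan, double min, double max) {
  // Mirrors the has_quiet_NaN guard in RandomUniformFunc::operator().
  if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
    std::bernoulli_distribution is_nan(pnan);
    if (pnan > 0 && is_nan(rng)) {
      return Element(NAN);
    }
  }
  std::uniform_real_distribution<double> uni(min, max);
  return static_cast<Element>(uni(rng));
}

int main() {
  std::mt19937 rng(2024);
  int nan_count = 0;
  for (int i = 0; i < 1000; ++i) {
    if (std::isnan(draw<float>(rng, /*pnan=*/0.1, 0.0, 1.0))) ++nan_count;
  }
  std::printf("NaNs: %d / 1000 (expect ~100)\n", nan_count);
  return 0;
}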
@ -590,6 +669,15 @@ struct RandomUniformFunc<complex<Element> > {
double min;
int int_scale;
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
bool exclude_zero;
//
// Methods
//
@ -598,15 +686,33 @@ struct RandomUniformFunc<complex<Element> > {
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0,
bool exclude_zero_ = false
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
, bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
, bernoulli_dist(pnan_)
, exclude_zero(exclude_zero_) {
std::srand((unsigned)seed);
}
// Handle cases where min = 0 or max = 0 for excluding zeros
if (exclude_zero) {
min = (min == 0.0) ? min + 1: min;
range = (max == 0.0) ? range - 1: range;
}
}
/// Compute random value and update RNG state
complex<Element> operator()() const {
complex<Element> operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN); // converts implicitly to complex<Element>; imaginary part stays zero
}
}
Element reals[2];
@ -625,6 +731,19 @@ struct RandomUniformFunc<complex<Element> > {
else {
reals[i] = from_real<Element>(Real(rnd));
}
if (exclude_zero &&
i == 0 &&
reals[0] == from_real<Element>(0.0)) {
if (rnd > 0.0) {
rnd = std::min(min + range, rnd + 1.0);
} else {
rnd = std::max(min, rnd - 1.0);
}
reals[0] = from_real<Element>(Real(rnd));
}
}
return complex<Element>(reals[0], reals[1]);
@ -642,6 +761,13 @@ struct RandomUniformFunc<Quaternion<Element> > {
double min;
int int_scale;
double pnan;
private:
using engine_type = std::mt19937;
public:
engine_type bernoulli_rnd;
std::bernoulli_distribution bernoulli_dist;
//
// Methods
//
@ -650,15 +776,26 @@ struct RandomUniformFunc<Quaternion<Element> > {
uint64_t seed_ = 0,
double max = 1,
double min_ = 0,
int int_scale_ = -1
int int_scale_ = -1,
double pnan_ = 0
):
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
std::srand((unsigned)seed);
}
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_),
bernoulli_rnd{static_cast<engine_type::result_type>(seed_)},
bernoulli_dist(pnan_)
{
std::srand((unsigned)seed);
}
/// Compute random value and update RNG state
Quaternion<Element> operator()() const {
Quaternion<Element> operator()() {
// Sample from NaN distribution.
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
return Element(NAN); // scalar NAN converts implicitly; remaining quaternion components stay zero
}
}
Element reals[4];
@ -712,7 +849,7 @@ struct TensorFillRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
view.at(coord) = func();
}
@ -749,7 +886,7 @@ struct TensorFillSymmetricRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
// Fill half of matrix based on FillMode
if (Layout::kRank == 2 &&
fill_mode == cutlass::FillMode::kLower &&
@ -796,7 +933,7 @@ struct TensorFillPadDiagonalRandomUniformFunc {
}
/// Compute random value and update RNG state
void operator()(Coord<Layout::kRank> const &coord) const {
void operator()(Coord<Layout::kRank> const &coord) {
// Fill half of matrix based on FillMode
if (Layout::kRank == 2 &&
(fill_mode == cutlass::FillMode::kLower) &&
@ -825,10 +962,12 @@ void TensorFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
/// data.
double pnan = 0, ///< Probability of NaN elements in [0, 1].
bool exclude_zero = false) { ///< Exclude zero from tensor init
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan, exclude_zero);
detail::TensorFillRandomUniformFunc<Element, Layout> func(
dst,
@ -850,12 +989,14 @@ void TensorFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
double pnan = 0, ///< Probability of NaN elements in [0, 1].
bool exclude_zero = false) { ///< Exclude zero from tensor init
TensorFillRandomUniform(dst.view_real(), seed, max, min, bits);
TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits);
TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero);
TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero);
}
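
A hypothetical call exercising both new knobs of the uniform overloads above (names and values are illustrative):

#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void uniform_fill_example() {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> B({64, 64});
  cutlass::reference::host::TensorFillRandomUniform(
      B.host_view(),
      /*seed=*/2024,
      /*max=*/1.0,
      /*min=*/-1.0,
      /*bits=*/-1,
      /*pnan=*/0.05,           // ~5% of elements become NaN
      /*exclude_zero=*/true);  // remaining samples are nudged off zero
}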
@ -972,10 +1113,11 @@ void BlockFillRandomUniform(
uint64_t seed, ///< seed for RNG
double max = 1, ///< upper bound of distribution
double min = 0, ///< lower bound for distribution
int bits = -1) { ///< If non-negative, specifies number of fractional bits that
int bits = -1, ///< If non-negative, specifies number of fractional bits that
/// are not truncated to zero. Permits reducing precision of
/// data.
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
/// data.
double pnan = 0) { ///< Probability of NaN elements in [0, 1].
detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan);
for (size_t i = 0; i < capacity; ++i) {
ReferenceFactory<Element>::get(ptr, i) = random_func();
@ -1259,7 +1401,11 @@ template <
void TensorFillRandom(
TensorView<Element, Layout> view, ///< destination tensor
uint64_t seed,
Distribution dist) {
Distribution dist,
bool exclude_zero = false ///< If true, excludes 0.
/// Note that setting this flag skews the distribution toward +/-1,
/// since zeros are replaced by simply adding or subtracting 1.
) {
using Real = typename RealType<Element>::Type;
@ -1269,14 +1415,18 @@ void TensorFillRandom(
seed,
dist.gaussian.mean,
dist.gaussian.stddev,
dist.int_scale);
dist.int_scale,
dist.gaussian.pnz,
exclude_zero);
} else if (dist.kind == Distribution::Uniform) {
TensorFillRandomUniform(
view,
seed,
dist.uniform.max,
dist.uniform.min,
dist.int_scale);
dist.int_scale,
dist.uniform.pnan,
exclude_zero);
}
}
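
Putting it together, a hypothetical Distribution-driven fill using the dispatch above; dist.uniform.pnan is assumed to be the member this commit adds alongside gaussian.pnz:

#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

void distribution_fill_example() {
  cutlass::Distribution dist;
  dist.set_uniform(/*min=*/-2.0, /*max=*/2.0, /*int_scale=*/0);
  dist.uniform.pnan = 0.0;   // assumed new member: probability of NaNs

  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({256, 256});
  cutlass::reference::host::TensorFillRandom(
      A.host_view(), /*seed=*/2024, dist, /*exclude_zero=*/true);
}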
@ -1354,7 +1504,8 @@ void BlockFillRandom(
seed,
dist.uniform.max,
dist.uniform.min,
dist.int_scale);
dist.int_scale,
dist.uniform.pnan);
}
}