CUTLASS 3.6.0 (#1850)
* v3.6
* update changelog
* update readme
* fix typo
* fixing typos
* hopper gemm with weight prefetch

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
@@ -221,7 +221,8 @@ cutlass_add_cutlass_library(

  # files split for parallel compilation
  src/reference/gemm_int4.cu
  src/reference/gemm_int8_canonical.cu
  src/reference/gemm_s8_s8_s32.cu
  src/reference/gemm_u8_u8_s32.cu
  src/reference/gemm_int8_interleaved_32.cu
  src/reference/gemm_int8_interleaved_64.cu
  src/reference/gemm_e4m3a_e4m3out.cu

@@ -278,6 +279,7 @@ execute_process(
    --generator-target library
    --architectures "${CUTLASS_NVCC_ARCHS_ENABLED}"
    --kernels "${CUTLASS_LIBRARY_KERNELS}"
    --instantiation-level "${CUTLASS_LIBRARY_INSTANTIATION_LEVEL}"
    --ignore-kernels "${CUTLASS_LIBRARY_IGNORE_KERNELS}"
    --exclude-kernels "${CUTLASS_LIBRARY_EXCLUDE_KERNELS}"
    --kernel-filter-file "${KERNEL_FILTER_FILE}"
@@ -113,6 +113,12 @@ template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
  static int const kMax = 90;
};

// Arch conditional sparse WGMMA
template <> struct ArchMap<arch::Sm90, arch::OpClassSparseTensorOp> {
  static int const kMin = 90;
  static int const kMax = 90;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
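These bounds are presumably what kernel filtering consults when deciding whether an operation can run on a given device; a minimal sketch of such a check (the `satisfies` helper is hypothetical, not from this change):

  // Hypothetical illustration: a device with compute capability cc can run
  // the operation only if kMin <= cc <= kMax for its (arch, opclass) pair.
  template <typename ArchTag, typename OpClass>
  bool satisfies(int cc) {
    using Map = ArchMap<ArchTag, OpClass>;
    return Map::kMin <= cc && cc <= Map::kMax;
  }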
@@ -103,6 +103,17 @@ public:
    void *device_workspace = nullptr,
    cudaStream_t stream = nullptr) const = 0;

  // Originally designed for metadata, but should be useful for FP8/6/4 too.
  virtual Status initialize_with_profiler_workspace(
    void const *configuration,
    void *host_workspace,
    void *device_workspace,
    uint8_t **profiler_workspace_ptrs,
    int problem_count,
    cudaStream_t stream = nullptr) {
    return Status::kErrorNotSupported;
  }

  virtual Status run(
    void const *arguments,
    void *host_workspace,
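The default body opts an operation out, so callers can probe for support; a minimal sketch of how a profiler loop might use the new hook (variable names are illustrative, not from the change):

  // Prefer the profiler-workspace path when the operation supports it.
  Status status = op->initialize_with_profiler_workspace(
      &config, host_ws, device_ws, workspace_ptrs, problem_count, stream);
  if (status == Status::kErrorNotSupported) {
    // Fall back to the ordinary initialization path.
    status = op->initialize(&config, host_ws, device_ws, stream);
  }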
@@ -290,7 +301,6 @@ struct GemmUniversalArguments {

  // Needed for some 3.x kernels
  int sm_count{0};

  library::RasterOrder raster_order{};
  int swizzle_size{1};
};
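These two fields are forwarded into a 3.x kernel's tile-scheduler arguments only when the scheduler actually exposes them as mutable knobs; the pattern, mirroring the sparse-GEMM operation added later in this change, is roughly:

  if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
    operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
  }
  if constexpr (!std::is_const_v<decltype(operator_args.scheduler.raster_order)>) {
    using Enum_t = decltype(operator_args.scheduler.raster_order);
    operator_args.scheduler.raster_order =
        (arguments->raster_order == RasterOrder::kAlongN) ? Enum_t::AlongN :
        (arguments->raster_order == RasterOrder::kAlongM) ? Enum_t::AlongM :
                                                            Enum_t::Heuristic;
  }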
@@ -616,7 +616,7 @@ private:
      /* traversal_stride = */ {traversal_stride_h, traversal_stride_w},
      /* dilation = */ {dilation_h, dilation_w},
      num_groups);
    out_args.mainloop.problem_shape = problem_shape;
    out_args.problem_shape = problem_shape;

    // ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)

@@ -788,7 +788,7 @@ private:
      /* traversal_stride = */ {traversal_stride_d, traversal_stride_h, traversal_stride_w},
      /* dilation = */ {dilation_d, dilation_h, dilation_w},
      num_groups);
    out_args.mainloop.problem_shape = problem_shape;
    out_args.problem_shape = problem_shape;

    // ConvProblemShape's constructor sets its shape_C member.
#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
@@ -249,7 +249,6 @@ protected:

  /* Query device SM count to pass onto the kernel as an argument, where needed */
  operator_args.hw_info.sm_count = arguments->sm_count;

  if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
    operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
  }

@@ -282,17 +281,18 @@ public:
    static_cast<GemmUniversalArguments const *>(arguments_ptr);

  OperatorArguments args;

  // can_implement rules may need access to problem shape
  args.problem_shape = cute::make_shape(
    configuration->problem_size.m(),
    configuration->problem_size.n(),
    configuration->problem_size.k(),
    configuration->batch_count);

  auto status = update_arguments_(args, arguments);
  if (status != Status::kSuccess) {
    return status;
  }

  return Operator::can_implement(args);
}
@@ -121,14 +121,14 @@ void initialize_gemm_reference_operations_fp_mixed_input(Manifest &manifest) {
    half_t,
    int8_t,
    half_t,
    float
  >(manifest);

  make_gemm_real_canonical_layouts<
    half_t,
    uint8_t,
    half_t,
    float
  >(manifest);

  // bfloat16_t mixed with 8-bit integer input
@@ -54,6 +54,14 @@ void initialize_gemm_reference_operations_fp_other(Manifest &manifest) {
    half_t
  >(manifest);

  make_gemm_real_canonical_layouts<
    half_t,
    half_t,
    float,
    half_t,
    half_t
  >(manifest);

  make_gemm_real_canonical_layouts<
    double,
    double,
@@ -73,7 +73,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
    int32_t,
    NumericConverterClamp<int32_t, float>
  >(manifest);

  make_gemm_real_canonical_layouts<
    int4b_t,
    int8_t,

@@ -110,7 +110,7 @@ void initialize_gemm_reference_operations_int_mixed_input(Manifest &manifest) {
    int32_t,
    NumericConverterClamp<int32_t, float>
  >(manifest);

  make_gemm_real_canonical_layouts<
    int8_t,
    int4b_t,
tools/library/src/reference/gemm_s8_s8_s32.cu (new file, 146 lines)
@@ -0,0 +1,146 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Instantiates GEMM reference implementations.
*/
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

#include "gemm_reference_operation.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

// A/B: s8
// Acc: s32
// C/D: some variants
// Epi Scalar: some variants

// 1. s8_s8_s32_s32_s32 (s32 epi scalar)
// 2. s8_s8_s32_s32_s32 (f32 epi scalar)
// 3. s8_s8_s32_s8_s8 (f32 epi scalar)
// 4. s8_s8_s32_s8_s8 (s32 epi scalar)
// 5. s8_s8_s32_s32_s8 (f32 epi scalar)
// 6. s8_s8_s32_f32_f32
// 7. s8_s8_s32_f16_f16 (f32 epi scalar)

// D = convert( Scalar(alpha) * Scalar( A * B ) + Scalar(beta) * Scalar( C ) )
// Convert: from epi Scalar dtype to D dtype
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest) {
  // 1.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    int32_t,  // ElementC
    int32_t,  // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int32_t   // ElementD
  >(manifest);

  // 2. (f32 epilogue scalar, per the list above)
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    int32_t,  // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int32_t,  // ElementD
    NumericConverterClamp<int32_t, float>  // From Scalar to D
  >(manifest);

  // 3.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    int8_t,   // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int8_t,   // ElementD
    NumericConverterClamp<int8_t, float>  // From Scalar to D
  >(manifest);

  // 4.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    int8_t,   // ElementC
    int32_t,  // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int8_t,   // ElementD
    NumericConverterClamp<int8_t, int32_t>  // From Scalar to D
  >(manifest);

  // 5.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    int32_t,  // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int8_t,   // ElementD
    NumericConverterClamp<int8_t, float>  // From Scalar to D
  >(manifest);

  // 6.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    float,    // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    float     // ElementD
  >(manifest);

  // 7.
  make_gemm_real_canonical_layouts<
    int8_t,   // ElementA
    int8_t,   // ElementB
    half_t,   // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    half_t,   // ElementD
    NumericConverterClamp<half_t, float>  // From Scalar to D
  >(manifest);
}

///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////
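As a sanity reference for the epilogue formula above, here is a per-element sketch for combination 3 (s8 inputs, s32 accumulation, f32 epilogue scalars, s8 output); the exact rounding behavior of NumericConverterClamp is simplified:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // D = convert(alpha * (A * B) + beta * C), saturated into the s8 range.
  // ab_accum is the s32 dot-product accumulator for one output element.
  int8_t reference_element(int32_t ab_accum, int8_t c, float alpha, float beta) {
    float epi = alpha * static_cast<float>(ab_accum) + beta * static_cast<float>(c);
    // NumericConverterClamp-style conversion: round to nearest, then clamp.
    float clamped = std::min(127.0f, std::max(-128.0f, std::nearbyint(epi)));
    return static_cast<int8_t>(clamped);
  }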
@@ -45,72 +45,48 @@ namespace library {

///////////////////////////////////////////////////////////////////////////////////////////////////

// A/B: u8
// Acc: s32
// C/D: some variants

// 1. u8_u8_s32_s32_s32 (s32 epi scalar)
// 2. u8_u8_s32_s32_s32 (f32 epi scalar)
// 3. u8_u8_s32_s8_s8 (f32 epi scalar)

void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest) {
  // 1.
  make_gemm_real_canonical_layouts<
    uint8_t,  // ElementA
    uint8_t,  // ElementB
    int32_t,  // ElementC
    int32_t,  // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int32_t   // ElementD
  >(manifest);

  // 2.
  make_gemm_real_canonical_layouts<
    uint8_t,  // ElementA
    uint8_t,  // ElementB
    int32_t,  // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int32_t,  // ElementD
    NumericConverterClamp<int32_t, float>  // From Scalar to D
  >(manifest);

  // 3.
  make_gemm_real_canonical_layouts<
    uint8_t,  // ElementA
    uint8_t,  // ElementB
    int8_t,   // ElementC
    float,    // ElementScalar / ElementCompute
    int32_t,  // ElementAccumulator
    int8_t,   // ElementD
    NumericConverterClamp<int8_t, float>  // From Scalar to D
  >(manifest);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -46,7 +46,8 @@ namespace library {
void initialize_gemm_reference_operations_int4(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_32(Manifest &manifest);
void initialize_gemm_reference_operations_int8_interleaved_64(Manifest &manifest);
void initialize_gemm_reference_operations_int8_canonical(Manifest &manifest);
void initialize_gemm_reference_operations_s8_s8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_u8_u8_s32(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e5m2a_e4m3out(Manifest &manifest);
void initialize_gemm_reference_operations_e4m3a_e5m2out(Manifest &manifest);

@@ -72,7 +73,8 @@ void initialize_reference_operations(Manifest &manifest) {

  initialize_gemm_reference_operations_int8_interleaved_32(manifest);
  initialize_gemm_reference_operations_int8_interleaved_64(manifest);
  initialize_gemm_reference_operations_int8_canonical(manifest);
  initialize_gemm_reference_operations_s8_s8_s32(manifest);
  initialize_gemm_reference_operations_u8_u8_s32(manifest);

  initialize_gemm_reference_operations_e4m3a_e4m3out(manifest);
  initialize_gemm_reference_operations_e5m2a_e4m3out(manifest);

@@ -85,7 +87,6 @@ void initialize_reference_operations(Manifest &manifest) {
  initialize_gemm_reference_operations_fp32out(manifest);
  initialize_gemm_reference_operations_fp_other(manifest);
  initialize_gemm_reference_operations_fp_mixed_input(manifest);

  initialize_gemm_reference_operations_int_mixed_input(manifest);

}
tools/library/src/sparse_gemm_operation_3x.hpp (new file, 445 lines)
@@ -0,0 +1,445 @@
/***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Defines operations for all GEMM operation kinds in CUTLASS Library.
*/
#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp"       // StructuredSparseCompressor
#include "cutlass/transform/device/transform_universal_adapter.hpp"  // TransformUniversalAdapter
#include "cutlass/util/packed_stride.hpp"                            // make_cute_packed_stride
#include "gemm_operation_3x.hpp"
#include "library_internal.h"

///////////////////////////////////////////////////////////////////////////////////////////////////

#define CUDA_CHECK(cuda_error)                                                                  \
  {                                                                                             \
    if (cuda_error != cudaSuccess) {                                                            \
      printf("cudaError %s in %s:%d\n", cudaGetErrorString(cuda_error), __func__, __LINE__);    \
      return Status::kInvalid;                                                                  \
    }                                                                                           \
  }

namespace cutlass::library {

///////////////////////////////////////////////////////////////////////////////////////////////////
// Limitations & Assumptions:
// 1. The tensor must be densely packed. That is, lda is k if the tensor is k-major,
//    and lda is m if the tensor is m-major.
// 2. The circular buffers for tensorA and tensorE may hold fewer entries than those for
//    tensorB and the other tensors, because the problem_count information is not available
//    in get_device_workspace_size(). They are still guaranteed to use at least 192 MB of
//    memory when the circular buffer is enabled.
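// Example (illustrative, not from the source): for an m-by-k tensor A that is
// k-major, "densely packed" means lda == k, so element (i, j) lives at
// A[i * k + j]; for an m-major tensor it means lda == m.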
template <typename Operator_>
class SparseGemmUniversal3xOperation : public GemmOperation3xBase<Operator_> {
public:

  using Operator = Operator_;
  using OperatorArguments = typename Operator::Arguments;
  using ElementA = typename Operator::ElementA;
  using LayoutA = typename Operator::LayoutA;
  using ElementB = typename Operator::ElementB;
  using LayoutB = typename Operator::LayoutB;
  using ElementC = typename Operator::ElementC;
  using LayoutC = typename Operator::LayoutC;
  using ElementD = typename Operator::ElementD;
  using LayoutD = typename Operator::LayoutD;
  using ElementAccumulator = typename Operator::ElementAccumulator;
  using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute;

  using CollectiveMainloop = typename Operator::CollectiveMainloop;
  using CollectiveEpilogue = typename Operator::CollectiveEpilogue;
  using ThreadEpilogueOp = typename CollectiveEpilogue::ThreadEpilogueOp;

  using ElementE = typename CollectiveMainloop::ElementE;
  using LayoutE = typename CollectiveMainloop::LayoutE;
  using SparseConfig = typename CollectiveMainloop::SparseConfig;
  using LayoutATag = decltype(SparseConfig::deduce_layoutA_tag(typename CollectiveMainloop::LayoutA{}));
  using CompressorUtility = cutlass::transform::kernel::StructuredSparseCompressorUtility<
    cute::Shape<int, int, int, int>,
    ElementA,
    LayoutATag,
    SparseConfig>;
  using CompressorKernel = cutlass::transform::kernel::StructuredSparseCompressor<
    cute::Shape<int, int, int, int>,
    ElementA,
    LayoutATag,
    SparseConfig,
    typename Operator::ArchTag>;

  using Compressor = cutlass::transform::device::TransformUniversalAdapter<CompressorKernel>;

public:

  /// Constructor
  SparseGemmUniversal3xOperation(char const *name = "unknown_gemm"):
    GemmOperation3xBase<Operator_>(name, GemmKind::kUniversal) {}
protected:

  /// Constructs the arguments structure given the configuration and arguments
  static Status construct_arguments_(
    OperatorArguments &operator_args, GemmUniversalConfiguration const *configuration) {
    // NOTE: GemmUniversalConfiguration does not contain problem shapes or batch strides.
    // Do nothing here and construct kernel arguments in update_arguments_ instead.
    // We also cannot construct TMA descriptors without all the arguments available.

    operator_args.mode = configuration->mode;
    return Status::kSuccess;
  }

  template<class FusionArgs, class = void>
  struct UpdateFusionArgs {
    static Status update_(FusionArgs const& fusion_args, GemmUniversalArguments const &arguments) {
      // If a custom EVT is instantiated then it is the user's responsibility
      // to ensure alpha and beta are updated appropriately
      return Status::kSuccess;
    }
  };

  template<class FusionArgs>
  struct UpdateFusionArgs<FusionArgs, cute::void_t<decltype(FusionArgs{}.alpha)>> {
    static Status update_(FusionArgs& fusion_args, GemmUniversalArguments const &arguments) {
      if (arguments.pointer_mode == ScalarPointerMode::kHost) {
        fusion_args.alpha = *static_cast<ElementCompute const *>(arguments.alpha);
        fusion_args.beta = *static_cast<ElementCompute const *>(arguments.beta);
        fusion_args.alpha_ptr = nullptr;
        fusion_args.beta_ptr = nullptr;

        return Status::kSuccess;
      }
      else if (arguments.pointer_mode == ScalarPointerMode::kDevice) {
        fusion_args.alpha = 0;
        fusion_args.beta = 0;
        fusion_args.alpha_ptr = static_cast<ElementCompute const *>(arguments.alpha);
        fusion_args.beta_ptr = static_cast<ElementCompute const *>(arguments.beta);

        return Status::kSuccess;
      }
      else {
        return Status::kErrorInvalidProblem;
      }
    }
  };
  /// Constructs the arguments structure given the configuration and arguments
  static Status update_arguments_(
    OperatorArguments &operator_args,
    GemmUniversalArguments const *arguments,
    CompressorUtility const& compressor_utility,
    void* device_a_compressed_ptr = nullptr,
    void* device_e_ptr = nullptr) {
    Status status = Status::kSuccess;

    status = UpdateFusionArgs<decltype(operator_args.epilogue.thread)>::update_(
      operator_args.epilogue.thread, *arguments);
    if (status != Status::kSuccess) {
      return status;
    }

    // TODO: type erase Arguments structure in 3.0 GEMM
    operator_args.problem_shape = cute::make_shape(
      arguments->problem_size.m(),
      arguments->problem_size.n(),
      arguments->problem_size.k(),
      arguments->batch_count);

    // update arguments
    operator_args.mainloop.ptr_A = reinterpret_cast<ElementA const *>(device_a_compressed_ptr);
    operator_args.mainloop.ptr_B = static_cast<ElementB const *>(arguments->B);
    operator_args.mainloop.ptr_E = reinterpret_cast<ElementE const *>(device_e_ptr);
    operator_args.epilogue.ptr_C = static_cast<ElementC const *>(arguments->C);
    operator_args.epilogue.ptr_D = static_cast<ElementD *>(arguments->D);

    operator_args.mainloop.layout_a = compressor_utility.fill_layoutA_from_compressor();
    operator_args.mainloop.layout_e = compressor_utility.fill_layoutE_from_compressor();
    operator_args.mainloop.dB = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideB>(
      arguments->ldb, arguments->batch_stride_B);
    operator_args.epilogue.dC = cute::make_int_tuple_from<typename Operator::GemmKernel::StrideC>(
      arguments->ldc, arguments->batch_stride_C);
    operator_args.epilogue.dD = operator_args.epilogue.dC;

    /* Query device SM count to pass onto the kernel as an argument, where needed */
    operator_args.hw_info.sm_count = arguments->sm_count;
    if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
      operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
    }

    if constexpr (!std::is_const_v<decltype(operator_args.scheduler.raster_order)>) {
      using Enum_t = decltype(operator_args.scheduler.raster_order);
      switch (arguments->raster_order) {
        case RasterOrder::kAlongN:
          operator_args.scheduler.raster_order = Enum_t::AlongN;
          break;
        case RasterOrder::kAlongM:
          operator_args.scheduler.raster_order = Enum_t::AlongM;
          break;
        default:
          operator_args.scheduler.raster_order = Enum_t::Heuristic;
      }
    }

    return status;
  }
public:

  /// Returns success if the operation can proceed
  Status can_implement(
    void const *configuration_ptr, void const *arguments_ptr) const override {

    GemmUniversalConfiguration const *configuration =
      static_cast<GemmUniversalConfiguration const *>(configuration_ptr);
    GemmUniversalArguments const *arguments =
      static_cast<GemmUniversalArguments const *>(arguments_ptr);

    OperatorArguments args;
    auto problem_shape_MNKL = cute::make_shape(
      configuration->problem_size.m(),
      configuration->problem_size.n(),
      configuration->problem_size.k(),
      configuration->batch_count);

    const int M = configuration->problem_size.m();
    const int N = configuration->problem_size.n();
    const int K = configuration->problem_size.k();
    const int L = configuration->batch_count;
    using StrideA = typename CompressorUtility::StrideA;
    auto dA = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
    compressor_utility.set_problem_size(problem_shape_MNKL, dA);
    auto status = update_arguments_(args, arguments, compressor_utility);
    if (status != Status::kSuccess) {
      return status;
    }

    // can_implement rules may need access to problem shape
    args.problem_shape = problem_shape_MNKL;
    return Operator::can_implement(args);
  }
  /// Gets the host-side workspace
  uint64_t get_host_workspace_size(void const *) const override {
    // Memory to hold the operator
    host_op_workspace_size = sizeof(Operator);

    // Memory to hold the result of `.structure_sparse_zero_mask_fill()`
    tensor_a_size = compressor_utility.get_raw_tensor_A_bytes();

    // NOTE: order here is the order of workspace partition
    const uint64_t size = host_op_workspace_size + tensor_a_size;

    return size;
  }

  /// Gets the device-side workspace
  uint64_t get_device_workspace_size(
    void const *configuration_ptr, void const *arguments_ptr) const override {

    OperatorArguments args;
    auto status = update_arguments_(
      args, static_cast<GemmUniversalArguments const *>(arguments_ptr), compressor_utility);
    if (status != Status::kSuccess) {
      return 0;
    }

    typename Compressor::Arguments compress_arguments {
      {compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
      {/* empty, not used */},
      {/* empty, not used */} };

    // Size for one iteration.
    // For multi-iteration profiling, the result of this function must be multiplied
    // by the actual problem_count.
    tensor_ac_size = compressor_utility.get_compressed_tensor_A_bytes();
    tensor_e_size = compressor_utility.get_tensor_E_bytes();
    device_op_workspace_size = Operator::get_workspace_size(args);
    device_compress_workspace_size = Compressor::get_workspace_size(compress_arguments);

    // NOTE: order here is the order of workspace partition
    device_per_iter_workspace_size = device_op_workspace_size + device_compress_workspace_size + tensor_ac_size + tensor_e_size;

    return device_per_iter_workspace_size;
  }
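  // Illustrative note: for a profile that runs problem_count iterations, the
  // total device allocation works out to
  // device_per_iter_workspace_size * problem_count, per the comment above.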

  /// Initializes the workspace
  Status initialize(
    void const *configuration_ptr,
    void *host_workspace,
    void *device_workspace,
    cudaStream_t stream = nullptr) const override {
    return Status::kErrorInternal;
  }
  Status initialize_with_profiler_workspace(
    void const *configuration,
    void *host_workspace,
    void *device_workspace,
    uint8_t **profiler_workspaces,
    int problem_count_from_profiler,
    cudaStream_t stream = nullptr) {

    // Set problem_count.
    problem_count = problem_count_from_profiler;

    // * Host Ptr
    auto* host_op_workspace_ptr = reinterpret_cast<uint8_t*>(host_workspace);
    auto* host_a_raw_ptr = host_op_workspace_ptr + host_op_workspace_size;

    // * Construct Op
    Operator *op = new (host_op_workspace_ptr) Operator;

    // * Device Full Ptr
    device_full_ptr = reinterpret_cast<uint8_t*>(device_workspace);

    // * Device Ptr (1st iteration)
    // Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
    // iteri : op_workspace | tensor_ac | tensor_e
    auto* device_ptr_iter1 = device_full_ptr;
    auto* device_op_workspace_ptr_iter1 = device_ptr_iter1;
    auto* device_compressor_workspace_ptr_iter1 = device_op_workspace_ptr_iter1 + device_op_workspace_size;
    auto* device_a_compressed_ptr_iter1 = device_compressor_workspace_ptr_iter1 + device_compress_workspace_size;
    auto* device_e_ptr_iter1 = device_a_compressed_ptr_iter1 + tensor_ac_size;

    // * Device A Raw Ptr
    auto* device_a_raw_ptr = profiler_workspaces[0];

    // * Randomly fill 50% of TensorA w/ zero, following the structured sparse requirement
    cudaMemcpy(host_a_raw_ptr, device_a_raw_ptr, tensor_a_size, cudaMemcpyDeviceToHost);
    compressor_utility.structure_sparse_zero_mask_fill(host_a_raw_ptr, 2000);
    cudaMemcpy(device_a_raw_ptr, host_a_raw_ptr, tensor_a_size, cudaMemcpyHostToDevice);

    CUDA_CHECK(cudaGetLastError());

    // * Compress DTensorA and get DTensorAC & DTensorE
    cutlass::KernelHardwareInfo hw_info;
    hw_info.device_id = 0;
    hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
    typename Compressor::Arguments arguments{
      {compressor_utility.M, 0, compressor_utility.K, compressor_utility.L},
      {device_a_raw_ptr,
       compressor_utility.dA,
       device_a_compressed_ptr_iter1,
       device_e_ptr_iter1},
      {hw_info}
    };

    cutlass::Status status{cutlass::Status::kSuccess};

    Compressor compressor_op;
    status = compressor_op.can_implement(arguments);
    if (status != Status::kSuccess) {
      return status;
    }

    status = compressor_op.initialize(arguments, device_compressor_workspace_ptr_iter1, stream);
    if (status != Status::kSuccess) {
      return status;
    }

    status = compressor_op.run(stream);
    if (status != Status::kSuccess) {
      return status;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // * Copy Iter1's DTensorAC & DTensorE to each iteration's DTensorAC & DTensorE
    for (int iter_i = 1; iter_i < problem_count; iter_i++) {
      // * Device AC & E Ptr per iteration
      // Device workspace : | iter1 | iter2 | iter3 | .. | iterx |
      // iteri : op_workspace | tensor_ac | tensor_e
      auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_i;
      auto* device_op_workspace_ptr = device_ptr_iteri;
      auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
      auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
      auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;

      cudaMemcpy(device_a_compressed_ptr, device_a_compressed_ptr_iter1, tensor_ac_size, cudaMemcpyDeviceToDevice);
      cudaMemcpy(device_e_ptr, device_e_ptr_iter1, tensor_e_size, cudaMemcpyDeviceToDevice);
    }

    CUDA_CHECK(cudaGetLastError());

    return Status::kSuccess;
  }
  /// Runs the kernel
  Status run(
    void const *arguments_ptr,
    void *host_workspace,
    void *device_workspace = nullptr,
    cudaStream_t stream = nullptr) const override {

    OperatorArguments operator_args;

    auto* device_ptr_iteri = device_full_ptr + device_per_iter_workspace_size * iter_idx;
    auto* device_op_workspace_ptr = device_ptr_iteri;
    auto* device_compressor_workspace_ptr = device_op_workspace_ptr + device_op_workspace_size;
    auto* device_a_compressed_ptr = device_compressor_workspace_ptr + device_compress_workspace_size;
    auto* device_e_ptr = device_a_compressed_ptr + tensor_ac_size;
    iter_idx = (iter_idx + 1) % problem_count;

    Status status = update_arguments_(
      operator_args,
      static_cast<GemmUniversalArguments const *>(arguments_ptr),
      compressor_utility,
      device_a_compressed_ptr,
      device_e_ptr);
    if (status != Status::kSuccess) {
      return status;
    }

    Operator *op = static_cast<Operator *>(host_workspace);
    // We need to call initialize() since we have to rebuild TMA desc for every new set of args
    status = op->run(operator_args, device_op_workspace_ptr, stream);
    return status;
  }
private:
  // Variables that must change in the const functions.
  mutable CompressorUtility compressor_utility;
  mutable int problem_count = 1;
  mutable int iter_idx = 0;

  uint8_t* device_full_ptr = nullptr;

  mutable uint64_t tensor_ac_size = 0;
  mutable uint64_t tensor_e_size = 0;
  mutable uint64_t tensor_a_size = 0;
  mutable uint64_t host_op_workspace_size = 0;
  mutable uint64_t device_compress_workspace_size = 0;
  mutable uint64_t device_op_workspace_size = 0;
  mutable uint64_t device_per_iter_workspace_size = 0;
};
///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass::library

///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -756,6 +756,7 @@ OpcodeClassID_enumerants[] = {
  {"tensorop", "<tensorop>", OpcodeClassID::kTensorOp},
  {"wmmatensorop", "<wmmatensorop>", OpcodeClassID::kWmmaTensorOp},
  {"wmma", "<wmma>", OpcodeClassID::kWmmaTensorOp},
  {"sptensorop", "<sptensorop>", OpcodeClassID::kSparseTensorOp}
};

/// Converts an OpcodeClassID enumerant to a string
@@ -36,6 +36,7 @@

#if CUTLASS_ENABLE_CUBLAS
#include <cublas_v2.h>
#include <cublasLt.h>

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
@@ -90,25 +91,48 @@ Status cublas_satisfies(library::SymmDescription const &desc);
/// Additionally, it provides implicit cast from CublasCreate's object to cublasHandle_t's object
class CublasCreate {
private:
  cublasHandle_t handle;
  cublasStatus_t status;

public:
  CublasCreate() {
    status = cublasCreate(&handle);
  }

  ~CublasCreate() {
    cublasDestroy(handle);
  }

  /// Implicit cast CublasCreate object to cublasHandle_t
  operator cublasHandle_t() const { return handle; }

  /// returns cublasStatus_t for handle creation
  cublasStatus_t get_cublas_create_status() { return status; }
};

/// This is a helper class to create cublasLtHandle_t automatically on CublasLtCreate object creation and
/// to destroy cublasLtHandle_t on CublasLtCreate object destruction.
/// Additionally, it provides implicit cast from CublasLtCreate's object to cublasLtHandle_t's object
class CublasLtCreate {
private:
  cublasLtHandle_t handle;
  cublasStatus_t status;

public:
  CublasLtCreate() {
    status = cublasLtCreate(&handle);
  }

  ~CublasLtCreate() {
    cublasLtDestroy(handle);
  }

  /// Implicit cast CublasLtCreate object to cublasLtHandle_t
  operator cublasLtHandle_t() const { return handle; }

  /// returns cublasStatus_t for handle creation
  cublasStatus_t get_cublaslt_create_status() { return status; }
};
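// A minimal usage sketch for these RAII wrappers (illustrative; not part of
// the change):
//
//   CublasLtCreate cublaslt_handle;
//   if (cublaslt_handle.get_cublaslt_create_status() != CUBLAS_STATUS_SUCCESS) {
//     return Status::kErrorInternal;
//   }
//   // The implicit conversion lets the wrapper be passed wherever a raw
//   // cublasLtHandle_t is expected.
//   cublasLtHandle_t raw_handle = cublaslt_handle;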
/////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {
@@ -226,6 +250,80 @@ struct cublasGemmExDispatcher {
  cublasStatus_t operator()(cublasHandle_t handle);
};

/// Dispatcher to cublasLt kernels
//
struct cublasLtGemmExDispatcher {

  //
  // Data members
  //
  library::GemmDescription const &op_desc;
  library::GemmUniversalConfiguration configuration;
  library::GemmUniversalArguments arguments;

  // cublas-specific data structures to fill cublas API call arguments
  cublasOperation_t trans_A;
  cublasOperation_t trans_B;
  cudaDataType_t data_type_A;
  cudaDataType_t data_type_B;
  cudaDataType_t data_type_C;
  cudaDataType_t compute_data_type = CUDA_R_32F;

  // cublasLt-specific data structures
  cublasLtMatmulDesc_t operationDesc = NULL;
  cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
  cublasLtMatmulPreference_t preference = NULL;

  // Set by the call to get_cublaslt_algo()
  cublasLtMatmulHeuristicResult_t heuristicResult_;
  void *workspace = nullptr;

  Status status;

#if (__CUDACC_VER_MAJOR__ >= 11)
  cublasComputeType_t compute_type;
#endif

  //
  // Methods
  //

  cublasLtGemmExDispatcher(
    library::GemmDescription const &op_desc,
    library::GemmUniversalConfiguration configuration_,
    library::GemmUniversalArguments arguments_
  );

  /// Initializes the cublasLt variables
  void initialize_cublaslt();

  /// Runs auto-tuning for the cublasLt heuristics
  bool get_cublaslt_algo(cublasLtHandle_t handle,
    AlgorithmMode algorithm_mode
  );

  /// Executes GEMM using these arguments
  cublasStatus_t operator()(cublasLtHandle_t handle);

  ~cublasLtGemmExDispatcher() {

    // Descriptors are no longer needed as all GPU work was already enqueued
    if (preference) cublasLtMatmulPreferenceDestroy(preference);
    if (Ddesc) cublasLtMatrixLayoutDestroy(Ddesc);
    if (Cdesc) cublasLtMatrixLayoutDestroy(Cdesc);
    if (Bdesc) cublasLtMatrixLayoutDestroy(Bdesc);
    if (Adesc) cublasLtMatrixLayoutDestroy(Adesc);
    if (operationDesc) cublasLtMatmulDescDestroy(operationDesc);

    if (workspace) {
      cudaFree(workspace);
    }
  }
};
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Dispatcher to cublas rank k update kernels
@@ -48,7 +48,7 @@ namespace profiler {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// CUTLASS Profiler application
class CutlassProfiler {
private:
@@ -66,13 +66,10 @@ private:

  /// Prints usage
  void print_usage_(std::ostream &);

  /// Prints options
  void print_options_(std::ostream &);

  /// Initializes the device
  void initialize_device_();

  /// Enumerates all operations
  void enumerate_();
@@ -81,6 +81,9 @@ private:
  /// Buffer holding TensorRef instance to recently allocated memory
  std::vector<uint8_t> tensor_ref_buffer_;

  /// The device ID where the allocation is made
  int device_;

public:
  //
  // Static member functions

@@ -91,7 +94,7 @@ public:

  /// Returns the stride of a packed layout
  static std::vector<int64_t> get_packed_layout(
    library::LayoutTypeID layout_id,
    std::vector<int> const &extent);

  /// returns the capacity needed
@@ -103,16 +106,16 @@ public:

  /// Returns true if two blocks have exactly the same value
  static bool block_compare_equal(
    library::NumericTypeID numeric_type,
    void const *ptr_A,
    void const *ptr_B,
    size_t capacity);

  /// Returns true if two blocks have approximately the same value
  static bool block_compare_relatively_equal(
    library::NumericTypeID numeric_type,
    void const *ptr_A,
    void const *ptr_B,
    size_t capacity,
    double epsilon,
    double nonzero_floor);
@@ -123,15 +126,19 @@ public:
  //

  DeviceAllocation();

  DeviceAllocation(
    library::NumericTypeID type,
    size_t capacity,
    int device = -1);

  DeviceAllocation(
    library::NumericTypeID type,
    library::LayoutTypeID layout_id,
    std::vector<int> const &extent,
    std::vector<int64_t> const &stride = std::vector<int64_t>(),
    int batch_count = 1,
    int device = -1);

  ~DeviceAllocation();
@@ -142,9 +149,9 @@ public:

  /// Allocates memory for a given layout and tensor
  DeviceAllocation &reset(
    library::NumericTypeID type,
    library::LayoutTypeID layout_id,
    std::vector<int> const &extent,
    std::vector<int64_t> const &stride = std::vector<int64_t>(),
    int batch_count = 1);
@@ -157,7 +164,7 @@ public:

  /// Data type of contained elements
  library::NumericTypeID type() const;

  /// Pointer to start of device memory allocation
  void *data() const;

@@ -184,7 +191,7 @@ public:

  /// Capacity of allocation in number of elements
  size_t capacity() const;

  /// Capacity of allocation in bytes
  size_t bytes() const;

@@ -205,7 +212,7 @@ public:

  /// Initializes a host allocation to a random distribution using std::cout
  void initialize_random_sparsemeta_host(int seed, int MetaSizeInBits);

  /// Uniformly fills a tensor with a value when provided, otherwise zero
  void fill_device(double value);

@@ -221,8 +228,12 @@ public:
  /// Copies from an equivalent-sized tensor in device memory
  void copy_to_host(void *ptr);

  /// Writes a tensor to csv
  void write_tensor_csv(std::ostream &out);

private:
  /// A wrapper that sets the device, performs malloc, and sets back
  cudaError_t malloc(void** ptr, size_t size);
};

using DeviceAllocationList = std::list<DeviceAllocation>;
@@ -29,7 +29,7 @@
 *
 **************************************************************************************************/
/* \file
   \brief
*/

#pragma once
@@ -68,46 +68,52 @@ private:

  /// Non-owning set of named allocations
  AllocationMap allocations_;

public:

  /// Allocates memory of a given type, capacity (elements), and name
  DeviceAllocation *allocate_block(
    Options const &options,
    std::string const &name,
    library::NumericTypeID type,
    size_t capacity,
    size_t device_index);

  /// Allocates memory of a given type, capacity (elements), and name
  DeviceAllocation *allocate_tensor(
    Options const &options,
    std::string const &name,
    library::NumericTypeID type,
    library::LayoutTypeID layout_id,
    std::vector<int> const &extent,
    std::vector<int64_t> const &stride,
    int batch_count,
    size_t device_index);

  /// Allocates memory of a given type, capacity (elements), and name
  DeviceAllocation *allocate_and_initialize_tensor(
    Options const &options,
    std::string const &name,
    library::NumericTypeID type,
    library::LayoutTypeID layout_id,
    std::vector<int> const &extent,
    std::vector<int64_t> const &stride,
    int batch_count,
    int seed_shift,
    size_t device_index);

  /// Allocates memory for sparse meta data
  DeviceAllocation *allocate_and_initialize_sparsemeta_tensor(
    Options const &options,
    std::string const &name,
    library::NumericTypeID type,
    library::LayoutTypeID layout_id,
    library::NumericTypeID type_a,
    std::vector<int> const &extent,
    std::vector<int64_t> const &stride,
    int batch_count,
    int seed_shift,
    size_t device_index);

  /// Clears named allocations (but does not necessarily free memory)
  void clear();
@@ -82,12 +82,16 @@ public:
  struct Device {

    /// Device IDs
    std::vector<int> devices;

    /// Number of total devices
    /// This is not set by the user; it is set automatically
    int num_devices;

    /// CUDA Device properties
    std::vector<cudaDeviceProp> properties;

    /// Total memory allocation on each device
    size_t maximum_capacity;

    //

@@ -100,8 +104,11 @@ public:
    void print_options(std::ostream &out, int indent = 0) const;
    void print_device_info(std::ostream &out) const;

    /// Returns the device ID from a device index
    int device_id(size_t device_index) const;

    /// Returns the compute capability of the listed devices (e.g. 61, 60, 70, 75)
    int compute_capability(int device_index) const;
  };
/// Options related to initializing input tensors

@@ -129,7 +136,7 @@ public:
  //

  explicit Initialization(CommandLine const &cmdline);

  void print_usage(std::ostream &out) const;
  void print_options(std::ostream &out, int indent = 0) const;

@@ -171,13 +178,13 @@ public:
  //

  explicit Verification(CommandLine const &cmdline);

  void print_usage(std::ostream &out) const;
  void print_options(std::ostream &out, int indent = 0) const;

  /// Returns true if a provider is enabled
  bool provider_enabled(library::Provider provider) const;

  /// Returns the index of a provider if it's enabled
  size_t index(library::Provider provider) const;
};

@@ -225,7 +232,7 @@ public:
  /// Returns the index of a provider if it's enabled
  size_t index(library::Provider provider) const;
};

/// Options related to reporting
struct Report {

@@ -260,7 +267,7 @@ public:
  //

  explicit Report(CommandLine const &cmdline);

  void print_usage(std::ostream &out) const;
  void print_options(std::ostream &out, int indent = 0) const;
};

@@ -282,7 +289,7 @@ public:
  //

  explicit About(CommandLine const &cmdline);

  void print_usage(std::ostream &out) const;
  void print_options(std::ostream &out, int indent = 0) const;

@@ -303,7 +310,7 @@ public:

  /// Vector of operation name substrings
  std::vector<std::string> operation_names;

  /// Vector of excluded operation name substrings
  std::vector<std::string> excluded_operation_names;
@@ -51,10 +51,10 @@ namespace profiler {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
Conv2dOperationProfiler::Conv2dOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kConv2d,
    {
      {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
      {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv2d problem space"},

@@ -165,13 +165,13 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(

  int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
  int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;

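  // Why the division below: for strided dgrad, each input-gradient element
  // only receives contributions from the subset of filter taps whose offsets
  // are compatible with the stride -- approximately 1 / (stride_h * stride_w)
  // of them.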
  // Adjust mainloop flops for strided dgrad
  if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
    flops_mainloop_ = flops_mainloop_ / (stride_h * stride_w);
  }
  int64_t flops_total_ = flops_mainloop_ + flops_epilogue_;

  // complex-valued support
  switch (operation_desc.tile_description.math_instruction.math_operation) {
    case library::MathOperationID::kMultiplyAddComplex:
@@ -188,14 +188,14 @@ int64_t Conv2dOperationProfiler::Conv2dProblem::flops(

/// Extracts the problem dimensions
Status Conv2dOperationProfiler::initialize_configuration(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::ConvDescription const &operation_desc =
    static_cast<library::ConvDescription const &>(operation->description());

  if (!arg_as_int(problem_.n, "n", problem_space, problem)) {
@@ -207,7 +207,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
    // default value
    problem_.h = 16;
  }

  if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
    // default value
    problem_.w = 16;

@@ -227,7 +227,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
    // default value
    problem_.r = 3;
  }

  if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
    // default value
    problem_.s = 3;
@@ -280,14 +280,14 @@ Status Conv2dOperationProfiler::initialize_configuration(
  // cutlass profiler sets p and q which are cuDNN compliant.                           //
  //                                                                                    //
  ////////////////////////////////////////////////////////////////////////////////////////
  // set convolution output p
  if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
    // default value (set using cudnn formula for output height, when p is not provided)
    problem_.p = (
      problem_.h +
      2 * problem_.pad_h -
      ((problem_.r - 1) * problem_.dilation_h + 1)
    ) / (problem_.stride_h)
    + 1;
  }
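  // Worked example of the formula above: with h = 16, pad_h = 1, r = 3,
  // dilation_h = 1, and stride_h = 1, p = (16 + 2 - 3) / 1 + 1 = 16.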
@@ -295,10 +295,10 @@ Status Conv2dOperationProfiler::initialize_configuration(
  if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
    // default value (set using cudnn formula for output width, when q is not provided)
    problem_.q = (
      problem_.w +
      2 * problem_.pad_w -
      ((problem_.s - 1) * problem_.dilation_w + 1)
    ) / (problem_.stride_w)
    + 1;
  }
  /////////////////////////////////////////////////////////////////////////////////////////
@@ -313,7 +313,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
    // default value
    problem_.split_k_slices = 1;
  }

  if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
    // default value
    problem_.conv_mode = library::ConvModeID::kCrossCorrelation;

@@ -345,24 +345,24 @@ Status Conv2dOperationProfiler::initialize_configuration(
  }

  if (!arg_as_scalar(
    problem_.alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    problem_.beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
@ -389,7 +389,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
|
||||
int(problem_.split_k_slices),
|
||||
int(problem_.groups)
|
||||
);
|
||||
|
||||
|
||||
conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));
|
||||
|
||||
conv_workspace_.set_stride_vector(
|
||||
@ -420,7 +420,7 @@ Status Conv2dOperationProfiler::initialize_configuration(
|
||||
/// Initializes the performance result
|
||||
void Conv2dOperationProfiler::initialize_result_(
|
||||
PerformanceResult &result,
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
library::ConvDescription const &operation_desc,
|
||||
ProblemSpace const &problem_space) {
|
||||
|
||||
@ -432,15 +432,15 @@ void Conv2dOperationProfiler::initialize_result_(
|
||||
result.arguments.resize(problem_space.rank());
|
||||
|
||||
set_argument(result, "Activation", problem_space,
|
||||
std::string(library::to_string(operation_desc.activation().element))
|
||||
std::string(library::to_string(operation_desc.activation().element))
|
||||
+ ":" + library::to_string(operation_desc.activation().layout));
|
||||
|
||||
set_argument(result, "Filter", problem_space,
|
||||
std::string(library::to_string(operation_desc.filter().element))
|
||||
std::string(library::to_string(operation_desc.filter().element))
|
||||
+ ":" + library::to_string(operation_desc.filter().layout));
|
||||
|
||||
set_argument(result, "Output", problem_space,
|
||||
std::string(library::to_string(operation_desc.output().element))
|
||||
std::string(library::to_string(operation_desc.output().element))
|
||||
+ ":" + library::to_string(operation_desc.output().layout));
|
||||
|
||||
set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));
|
||||
@ -455,7 +455,7 @@ void Conv2dOperationProfiler::initialize_result_(
|
||||
set_argument(result, "k", problem_space, problem_.k);
|
||||
set_argument(result, "r", problem_space, problem_.r);
|
||||
set_argument(result, "s", problem_space, problem_.s);
|
||||
|
||||
|
||||
set_argument(result, "p", problem_space, problem_.p);
|
||||
set_argument(result, "q", problem_space, problem_.q);
|
||||
|
||||
@ -470,11 +470,11 @@ void Conv2dOperationProfiler::initialize_result_(
|
||||
set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
|
||||
set_argument(result, "dilation_w", problem_space, problem_.dilation_w);
|
||||
|
||||
set_argument(result, "split_k_mode", problem_space,
|
||||
set_argument(result, "split_k_mode", problem_space,
|
||||
std::string(library::to_string(problem_.split_k_mode)));
|
||||
set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);
|
||||
|
||||
set_argument(result, "conv_mode", problem_space,
|
||||
set_argument(result, "conv_mode", problem_space,
|
||||
std::string(library::to_string(problem_.conv_mode)));
|
||||
|
||||
set_argument(result, "alpha", problem_space,
|
||||
@ -483,19 +483,19 @@ void Conv2dOperationProfiler::initialize_result_(
|
||||
set_argument(result, "beta", problem_space,
|
||||
library::lexical_cast(problem_.beta, operation_desc.element_epilogue));
|
||||
|
||||
set_argument(result, "eq_gemm_provider", problem_space,
|
||||
set_argument(result, "eq_gemm_provider", problem_space,
|
||||
std::string(library::to_string(problem_.eq_gemm_provider)));
|
||||
|
||||
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
|
||||
|
||||
// Bytes of activation, filter, and output tensors
|
||||
int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
|
||||
int64_t activation_bytes = int64_t(library::sizeof_bits(operation_desc.activation().element) / 8) *
|
||||
conv_workspace_.configuration.problem_size.activation_size();
|
||||
|
||||
int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
|
||||
int64_t filter_bytes = int64_t(library::sizeof_bits(operation_desc.filter().element) / 8) *
|
||||
conv_workspace_.configuration.problem_size.filter_size();
|
||||
|
||||
int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
|
||||
int64_t output_bytes = int64_t(library::sizeof_bits(operation_desc.output().element) / 8) *
|
||||
conv_workspace_.configuration.problem_size.output_size();
|
||||
|
||||
// Bytes of activation, filter, and output tensors
|
||||
@ -511,14 +511,14 @@ void Conv2dOperationProfiler::initialize_result_(
|
||||
|
||||
/// Initialize reduction problem dimensions and library::Operation
|
||||
bool Conv2dOperationProfiler::initialize_reduction_configuration_(
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
PerformanceReport &report,
|
||||
DeviceContext &device_context,
|
||||
library::Operation const *operation,
|
||||
ProblemSpace const &problem_space,
|
||||
ProblemSpace::Problem const &problem) {
|
||||
|
||||
library::ConvDescription const &conv_desc =
|
||||
library::ConvDescription const &conv_desc =
|
||||
static_cast<library::ConvDescription const &>(operation->description());
|
||||
|
||||
library::ConvKind const &conv_kind = conv_desc.conv_kind;
|
||||
@ -545,14 +545,14 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
|
||||
conv_workspace_.reduction_configuration.ldd =
|
||||
conv_workspace_.configuration.stride_c[tensor_c_stride_idx];
|
||||
|
||||
// find reduction operation
|
||||
// find reduction operation
|
||||
library::ReductionFunctionalKey reduction_key(
|
||||
library::Provider::kCUTLASS,
|
||||
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
|
||||
conv_desc.tile_description.math_instruction.element_accumulator, // element workspace
|
||||
conv_desc.tile_description.math_instruction.element_accumulator, // element accumulator
|
||||
conv_desc.C.element, // element output
|
||||
conv_desc.element_epilogue // element compute
|
||||
);
|
||||
);
|
||||
|
||||
#if 0// debug print to check which reduction instance is selected
|
||||
std::cout << reduction_key << "\n";
|
||||
@ -562,7 +562,7 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
|
||||
if(reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// initialize reduction operation required for parallel split-k conv2d operator
|
||||
reduction_op_ = reduction_it->second;
|
||||
@ -574,13 +574,24 @@ bool Conv2dOperationProfiler::initialize_reduction_configuration_(
|
||||
|
||||
/// Initializes workspace
|
||||
Status Conv2dOperationProfiler::initialize_workspace(
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
PerformanceReport &report,
|
||||
DeviceContext &device_context,
|
||||
library::Operation const *operation,
|
||||
ProblemSpace const &problem_space,
|
||||
ProblemSpace::Problem const &problem) {
|
||||
|
||||
if (options.device.devices.size() != 1) {
|
||||
throw std::runtime_error("This operation profiler only supports a single "
|
||||
"device.");
|
||||
}
|
||||
|
||||
cudaError_t result;
|
||||
result = cudaSetDevice(options.device.device_id(0));
|
||||
if (result != cudaSuccess) {
|
||||
throw std::runtime_error("cudaSetDevice() failed.");
|
||||
}
|
||||
|
||||
// initialize conv2d underlying operation to handle parallel reduction
|
||||
library::Operation const* underlying_operation = operation;
|
||||
|
||||
@ -590,15 +601,15 @@ Status Conv2dOperationProfiler::initialize_workspace(
|
||||
}
|
||||
}
|
||||
|
||||
library::ConvDescription const &operation_desc =
|
||||
library::ConvDescription const &operation_desc =
|
||||
static_cast<library::ConvDescription const &>(underlying_operation->description());
|
||||
|
||||
// Compute the number of copies of the problem to avoid L2 camping.
|
||||
if (!options.profiling.workspace_count) {
|
||||
int64_t bytes = problem_.bytes(operation_desc);
|
||||
if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
|
||||
if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
|
||||
conv_workspace_.problem_count =
|
||||
1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
|
||||
1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
|
||||
}
|
||||
else {
|
||||
conv_workspace_.problem_count = 1;
|
||||
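The workspace_count logic above sizes a ring of problem copies so that consecutive timing iterations touch roughly three L2 capacities of distinct data, preventing the cache from serving operands for free across iterations. A minimal sketch of the same computation, assuming only a byte count and the device's L2 size:

#include <cstdint>

// Illustrative restatement of the rotating-workspace count above: if one
// copy of the operands already exceeds 3x L2, a single copy suffices;
// otherwise allocate enough copies to cover ~3x L2, so iteration i reads
// copy (i % count) and misses whatever iteration i-1 left in cache.
int rotating_workspace_count(int64_t problem_bytes, int64_t l2_cache_bytes) {
  if (problem_bytes >= 3 * l2_cache_bytes) {
    return 1;
  }
  return 1 + int((3 * l2_cache_bytes) / problem_bytes);
}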
@ -611,7 +622,7 @@ Status Conv2dOperationProfiler::initialize_workspace(

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    conv_workspace_.A = device_context.allocate_tensor(
    conv_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,

@ -619,10 +630,11 @@ Status Conv2dOperationProfiler::initialize_workspace(
      problem_.extent_a(operation_desc.conv_kind),
      conv_workspace_.configuration.stride_a,
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    conv_workspace_.B = device_context.allocate_tensor(
    conv_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,

@ -630,12 +642,13 @@ Status Conv2dOperationProfiler::initialize_workspace(
      problem_.extent_b(operation_desc.conv_kind),
      conv_workspace_.configuration.stride_b,
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    if (problem_.groups == problem_.c && problem_.groups == problem_.k) {
      // Depthwise direct conv kernel needs to reorder the filter.
      conv_workspace_.reordered_B = device_context.allocate_tensor(
      conv_workspace_.reordered_B = device_context.allocate_and_initialize_tensor(
        options,
        "B",
        operation_desc.B.element,

@ -643,11 +656,12 @@ Status Conv2dOperationProfiler::initialize_workspace(
        problem_.extent_b(operation_desc.conv_kind),
        conv_workspace_.configuration.stride_b,
        conv_workspace_.problem_count,
        seed_shift++
        seed_shift++,
        0 // device_index
      );
    }

    conv_workspace_.C = device_context.allocate_tensor(
    conv_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,

@ -655,25 +669,30 @@ Status Conv2dOperationProfiler::initialize_workspace(
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.configuration.stride_c,
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    conv_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.configuration.stride_c,
      conv_workspace_.problem_count
      conv_workspace_.problem_count,
      0 // device_index
    );

    conv_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.configuration.stride_c,
      conv_workspace_.problem_count
      conv_workspace_.problem_count,
      0 // device_index
    );
  }

@ -706,10 +725,10 @@ Status Conv2dOperationProfiler::initialize_workspace(
    conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);

    status = reduction_op_->initialize(
      &conv_workspace_.reduction_configuration,
      conv_workspace_.reduction_host_workspace.data(),
      nullptr);

    if (status != Status::kSuccess) {
      return status;
    }

@ -736,7 +755,7 @@ Status Conv2dOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool Conv2dOperationProfiler::verify_cutlass(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -769,7 +788,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
  }

  conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());

  if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
    // update library::ConvArguments for parallel split-k reduction
    conv_workspace_.arguments.D = conv_workspace_.device_workspace.data();

@ -799,9 +818,9 @@ bool Conv2dOperationProfiler::verify_cutlass(
  }

#if 0
  std::cout << "profiling : " << std::endl
    << "conv2d : " << operation->description().name << std::endl
    << "underlying conv2d : " << underlying_operation->description().name << std::endl
    << "reduction : " << reduction_op_->description().name << std::endl;
#endif

@ -818,7 +837,7 @@ bool Conv2dOperationProfiler::verify_cutlass(

  // Run parallel reduction kernel for parallel split_k_mode
  if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {

    results_.back().status = reduction_op_->run(
      &conv_workspace_.reduction_arguments,
      conv_workspace_.reduction_host_workspace.data(),

@ -840,7 +859,7 @@ bool Conv2dOperationProfiler::verify_cutlass(

  // CUTLASS op ran but has not yet been verified against any verification provider
  results_.back().disposition = Disposition::kNotVerified;

  //
  // Run verification providers
  //

@ -856,7 +875,7 @@ bool Conv2dOperationProfiler::verify_cutlass(

    Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);

    // Initialize reference data to the source data
    conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

    if (status == Status::kSuccess) {

@ -884,7 +903,7 @@ bool Conv2dOperationProfiler::verify_cutlass(
  // Run verification device reference
  if (options.verification.provider_enabled(library::Provider::kReferenceDevice)) {

    // Restore reference data back to initial source data
    conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

    verify_with_device_reference_(

@ -893,13 +912,13 @@ bool Conv2dOperationProfiler::verify_cutlass(
      device_context,
      operation,
      problem_space,
      problem);
  }

  // Run verification host reference
  if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {

    // Restore reference data back to initial source data
    conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

    verify_with_host_reference_(

@ -908,10 +927,10 @@ bool Conv2dOperationProfiler::verify_cutlass(
      device_context,
      operation,
      problem_space,
      problem);
  }

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for (auto &m : results_.back().verification_map) {
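The loop that begins here reduces the per-provider dispositions to a single verdict. A simplified standalone restatement of that reduction (the enums and map are stand-ins for the profiler's own types, and the exact precedence shown is an assumption, not a copy of the profiler's loop):

#include <map>

enum class Disposition { kPassed, kNotRun, kNotVerified, kIncorrect };
enum class Provider { kCUDNN, kReferenceHost, kReferenceDevice };

Disposition worst_case(std::map<Provider, Disposition> const &verification_map) {
  bool any_passed = false;
  for (auto const &m : verification_map) {
    if (m.second == Disposition::kIncorrect) {
      return Disposition::kIncorrect;   // any failure dominates
    }
    if (m.second == Disposition::kPassed) {
      any_passed = true;
    }
  }
  return any_passed ? Disposition::kPassed : Disposition::kNotVerified;
}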
@ -936,7 +955,7 @@ bool Conv2dOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against host reference
bool Conv2dOperationProfiler::verify_with_host_reference_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -954,14 +973,14 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(

  library::ConvFunctionalKey conv2d_key(
    library::Provider::kReferenceHost,
    conv_desc.conv_kind,
    conv_desc.A.element,
    conv_desc.A.layout,
    conv_desc.B.element,
    conv_desc.B.layout,
    conv_desc.C.element,
    conv_desc.C.layout,
    conv_desc.tile_description.math_instruction.element_accumulator,
    conv_desc.element_epilogue);

#if 0 // debug print to check which host reference instance is selected

@ -974,12 +993,12 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(

    results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
    return true;
  }

  // conv2d host reference minimum cc is 0 (CPU) and no iterator algorithm
  library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
  auto cc_it = operators_it->second.find(preference_key);

  if (cc_it == operators_it->second.end()) {
    results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
    return true;

@ -1052,9 +1071,9 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {

    save_workspace(
      device_context,
      options,

@ -1070,7 +1089,7 @@ bool Conv2dOperationProfiler::verify_with_host_reference_(

/// Verifies CUTLASS against device reference
bool Conv2dOperationProfiler::verify_with_device_reference_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -1088,14 +1107,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(

  library::ConvFunctionalKey conv2d_key(
    library::Provider::kReferenceDevice,
    conv_desc.conv_kind,
    conv_desc.A.element,
    conv_desc.A.layout,
    conv_desc.B.element,
    conv_desc.B.layout,
    conv_desc.C.element,
    conv_desc.C.layout,
    conv_desc.tile_description.math_instruction.element_accumulator,
    conv_desc.element_epilogue);

  auto operators_it = Singleton::get().operation_table.conv2d_operations.find(conv2d_key);

@ -1105,12 +1124,12 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
    results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;
    return true;
  }

  // conv2d device reference minimum cc is 50 and no iterator algorithm
  library::ConvPreferenceKey preference_key(50, library::IteratorAlgorithmID::kNone);
  auto cc_it = operators_it->second.find(preference_key);

  if (cc_it == operators_it->second.end()) {
    results_.back().verification_map[library::Provider::kReferenceDevice] = Disposition::kNotRun;

@ -1119,7 +1138,7 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(

  // device reference has only one instance in Conv2dOperationVectorMap
  library::Operation const *reference_op = cc_it->second[0];

  //
  // Initialize device reference operation
  //

@ -1166,9 +1185,9 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kReferenceDevice] == Disposition::kIncorrect) {

    save_workspace(
      device_context,
      options,

@ -1183,14 +1202,14 @@ bool Conv2dOperationProfiler::verify_with_device_reference_(

/// Measures performance results
bool Conv2dOperationProfiler::profile(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {

    // Initialize structure containing Conv2d arguments

@ -1242,7 +1261,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
  GpuTimer timer;

  // initialize conv2d underlying operation to handle parallel reduction
  library::Operation const* underlying_operation = operation;

  library::ConvArguments *conv_arguments = static_cast<library::ConvArguments *>(arguments);

@ -1274,7 +1293,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
  conv_arguments->B = conv_workspace_.B->batch_data(problem_idx);
  conv_arguments->C = conv_workspace_.C->batch_data(problem_idx);
  conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx);

  if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
    // update library::ConvArguments for parallel split-k reduction
    conv_arguments->D = conv_workspace_.device_workspace.data();

@ -1304,7 +1323,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
      return status;
    }
  }

  //
  // Initialize GPU timer
  //

@ -1319,7 +1338,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(

  int iteration = 0;
  for (; iteration < Iterations; ++iteration) {

    // Setup rotating workspace
    int problem_idx = (iteration % conv_workspace_.problem_count);

@ -1345,7 +1364,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
      device_workspace);

    // Run parallel reduction kernel for parallel split_k_mode
    if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {

      status = reduction_op_->run(
        &conv_workspace_.reduction_arguments,

@ -1367,7 +1386,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
  //
  // Update performance result
  //

  runtime = timer.duration(iteration);

  return status;
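The profiling loop above enqueues all iterations between one start/stop event pair and divides the elapsed time by the iteration count. A standalone sketch of that event-based pattern (launch_kernel stands in for operation->run(); CUTLASS's GpuTimer wraps the same CUDA calls, though its internals may differ):

#include <cuda_runtime.h>

float average_ms(void (*launch_kernel)(), int iterations) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);              // record once, before the loop
  for (int i = 0; i < iterations; ++i) {
    launch_kernel();                   // enqueue all launches back to back
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);          // wait for the final launch

  float total_ms = 0.f;
  cudaEventElapsedTime(&total_ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return total_ms / float(iterations); // per-iteration average
}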
@ -1378,13 +1397,13 @@ Status Conv2dOperationProfiler::profile_cutlass_(

/// Verifies CUTLASS against cudnn reference
bool Conv2dOperationProfiler::verify_with_cudnn_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  auto &conv_desc = static_cast<library::ConvDescription const &>(operation->description());

  //

@ -1395,7 +1414,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
  cudnnStatus_t status = handle.get_cudnn_create_status();

  if (status != CUDNN_STATUS_SUCCESS) {

    results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
    return true;
  }

@ -1411,7 +1430,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
  conv_workspace_.arguments.alpha = problem_.alpha.data();
  conv_workspace_.arguments.beta = problem_.beta.data();
  conv_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  // cuDNN does not support four tensor arguments, so we copy the tensor C data into
  // tensor D.
  conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

@ -1423,8 +1442,8 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
  // Construct dispatcher to cudnn operator
  //

  detail::cudnnConvDispatcher conv_op(
    conv_desc,
    conv_workspace_.configuration,
    conv_workspace_.arguments,
    handle

@ -1462,7 +1481,7 @@ bool Conv2dOperationProfiler::verify_with_cudnn_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {

    save_workspace(

@ -52,10 +52,10 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
Conv3dOperationProfiler::Conv3dOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kConv3d,
    {
      {ArgumentTypeID::kEnumerated, {"conv_kind"}, "Convolutional operator (fprop, dgrad, wgrad)"},
      {ArgumentTypeID::kInteger, {"n", "input_n"}, "Input N dimension of the Conv3d problem space"},

@ -170,7 +170,7 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(

  int64_t flops_mainloop_ = int64_t(mnk.m()) * mnk.n() * mnk.k() * 2;
  int64_t flops_epilogue_ = int64_t(mnk.m()) * int64_t(mnk.n()) * 2;

  // Adjust mainloop flop for dgrad strided
  if (operation_desc.conv_kind == library::ConvKind::kDgrad) {
    flops_mainloop_ = flops_mainloop_ / (stride_d * stride_h * stride_w);

@ -183,14 +183,14 @@ int64_t Conv3dOperationProfiler::Conv3dProblem::flops(

/// Extracts the problem dimensions
Status Conv3dOperationProfiler::initialize_configuration(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::ConvDescription const &operation_desc =
    static_cast<library::ConvDescription const &>(operation->description());

  if (!arg_as_int(problem_.n, "n", problem_space, problem)) {

@ -207,7 +207,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
    // default value
    problem_.h = 14;
  }

  if (!arg_as_int(problem_.w, "w", problem_space, problem)) {
    // default value
    problem_.w = 14;

@ -232,7 +232,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
    // default value
    problem_.r = 3;
  }

  if (!arg_as_int(problem_.s, "s", problem_space, problem)) {
    // default value
    problem_.s = 3;

@ -294,25 +294,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
  // cutlass profiler sets p and q which are cuDNN compliant.                          //
  //                                                                                   //
  ////////////////////////////////////////////////////////////////////////////////////////
  // set convolution output z
  if (!arg_as_int(problem_.z, "z", problem_space, problem)) {
    // default value (set using cudnn formula for output depth, when z is not provided)
    problem_.z = (
      problem_.d +
      2 * problem_.pad_d -
      ((problem_.t - 1) * problem_.dilation_d + 1)
    ) / (problem_.stride_d)
    + 1;
  }

  // set convolution output p
  if (!arg_as_int(problem_.p, "p", problem_space, problem)) {
    // default value (set using cudnn formula for output height, when p is not provided)
    problem_.p = (
      problem_.h +
      2 * problem_.pad_h -
      ((problem_.r - 1) * problem_.dilation_h + 1)
    ) / (problem_.stride_h)
    + 1;
  }

@ -320,10 +320,10 @@ Status Conv3dOperationProfiler::initialize_configuration(
  if (!arg_as_int(problem_.q, "q", problem_space, problem)) {
    // default value (set using cudnn formula for output width, when q is not provided)
    problem_.q = (
      problem_.w +
      2 * problem_.pad_w -
      ((problem_.s - 1) * problem_.dilation_w + 1)
    ) / (problem_.stride_w)
    + 1;
  }
  /////////////////////////////////////////////////////////////////////////////////////////

@ -338,7 +338,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
    // default value
    problem_.split_k_slices = 1;
  }

  if (!arg_as_ConvModeID(problem_.conv_mode, "conv_mode", problem_space, problem)) {
    // default value
    problem_.conv_mode = library::ConvModeID::kCrossCorrelation;

@ -370,24 +370,24 @@ Status Conv3dOperationProfiler::initialize_configuration(
  }

  if (!arg_as_scalar(
    problem_.alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    problem_.beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
  }

@ -420,25 +420,25 @@ Status Conv3dOperationProfiler::initialize_configuration(
    int(problem_.split_k_slices),
    1 // groups
  );

  conv_workspace_.configuration.split_k_mode = static_cast<conv::SplitKMode>(static_cast<int>(problem_.split_k_mode));

  conv_workspace_.configuration.layout_activations.stride() = make_Coord(
    int(problem_.c),
    int(problem_.w) * int(problem_.c),
    int(problem_.h) * int(problem_.w) * int(problem_.c),
    int(problem_.d) * int(problem_.h) * int(problem_.w) * int(problem_.c)
  );

  conv_workspace_.configuration.layout_filters.stride() = make_Coord(
    int(problem_.c),
    int(problem_.s) * int(problem_.c),
    int(problem_.r) * int(problem_.s) * int(problem_.c),
    int(problem_.t) * int(problem_.r) * int(problem_.s) * int(problem_.c)
  );

  conv_workspace_.configuration.layout_output.stride() = make_Coord(
    int(problem_.k),
    int(problem_.q) * int(problem_.k),
    int(problem_.q) * int(problem_.p) * int(problem_.k),
    int(problem_.z) * int(problem_.q) * int(problem_.p) * int(problem_.k)
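The make_Coord calls above build packed NDHWC strides: each dimension's stride is the product of all faster-varying extents, with C varying fastest. A worked example (illustrative values, not tied to the profiler):

#include <cstdint>
#include <cstdio>

int main() {
  // For D=8, H=16, W=16, C=32, element (n, d, h, w, c) lives at offset
  // c + w*C + h*W*C + d*H*W*C + n*D*H*W*C.
  int64_t D = 8, H = 16, W = 16, C = 32;
  int64_t stride_w = C;              // step one pixel along W
  int64_t stride_h = W * C;          // step one row
  int64_t stride_d = H * W * C;      // step one depth slice
  int64_t stride_n = D * H * W * C;  // step one batch image
  std::printf("strides: %lld %lld %lld %lld\n",
              (long long)stride_w, (long long)stride_h,
              (long long)stride_d, (long long)stride_n);
  return 0;
}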
@ -469,7 +469,7 @@ Status Conv3dOperationProfiler::initialize_configuration(
/// Initializes the performance result
void Conv3dOperationProfiler::initialize_result_(
  PerformanceResult &result,
  Options const &options,
  library::ConvDescription const &operation_desc,
  ProblemSpace const &problem_space) {

@ -481,15 +481,15 @@ void Conv3dOperationProfiler::initialize_result_(
  result.arguments.resize(problem_space.rank());

  set_argument(result, "Activation", problem_space,
    std::string(library::to_string(operation_desc.activation().element))
    + ":" + library::to_string(operation_desc.activation().layout));

  set_argument(result, "Filter", problem_space,
    std::string(library::to_string(operation_desc.filter().element))
    + ":" + library::to_string(operation_desc.filter().layout));

  set_argument(result, "Output", problem_space,
    std::string(library::to_string(operation_desc.output().element))
    + ":" + library::to_string(operation_desc.output().layout));

  set_argument(result, "conv_kind", problem_space, library::to_string(operation_desc.conv_kind));

@ -506,7 +506,7 @@ void Conv3dOperationProfiler::initialize_result_(
  set_argument(result, "t", problem_space, problem_.t);
  set_argument(result, "r", problem_space, problem_.r);
  set_argument(result, "s", problem_space, problem_.s);

  set_argument(result, "z", problem_space, problem_.z);
  set_argument(result, "p", problem_space, problem_.p);
  set_argument(result, "q", problem_space, problem_.q);

@ -523,11 +523,11 @@ void Conv3dOperationProfiler::initialize_result_(
  set_argument(result, "dilation_h", problem_space, problem_.dilation_h);
  set_argument(result, "dilation_w", problem_space, problem_.dilation_w);

  set_argument(result, "split_k_mode", problem_space,
    std::string(library::to_string(problem_.split_k_mode)));
  set_argument(result, "split_k_slices", problem_space, problem_.split_k_slices);

  set_argument(result, "conv_mode", problem_space,
    std::string(library::to_string(problem_.conv_mode)));

  set_argument(result, "alpha", problem_space,

@ -536,7 +536,7 @@ void Conv3dOperationProfiler::initialize_result_(
  set_argument(result, "beta", problem_space,
    library::lexical_cast(problem_.beta, operation_desc.element_epilogue));

  set_argument(result, "eq_gemm_provider", problem_space,
    std::string(library::to_string(problem_.eq_gemm_provider)));

  OperationProfiler::initialize_result_(result, operation_desc, problem_space);

@ -554,14 +554,14 @@ void Conv3dOperationProfiler::initialize_result_(

/// Initialize reduction problem dimensions and library::Operation
bool Conv3dOperationProfiler::initialize_reduction_configuration_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::ConvDescription const &conv_desc =
    static_cast<library::ConvDescription const &>(operation->description());

  library::ConvKind const &conv_kind = conv_desc.conv_kind;

@ -585,14 +585,14 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
  conv_workspace_.reduction_configuration.lds = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];
  conv_workspace_.reduction_configuration.ldd = conv_workspace_.configuration.layout_c(conv_kind).stride()[tensor_c_stride_idx];

  // find reduction operation
  library::ReductionFunctionalKey reduction_key(
    library::Provider::kCUTLASS,
    conv_desc.tile_description.math_instruction.element_accumulator,  // element workspace
    conv_desc.tile_description.math_instruction.element_accumulator,  // element accumulator
    conv_desc.C.element,                                              // element output
    conv_desc.element_epilogue                                        // element compute
  );

#if 0 // debug print to check which reduction instance is selected
  std::cout << reduction_key << "\n";

@ -602,7 +602,7 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(
  if (reduction_it == Singleton::get().operation_table.reduction_operations.end()) {
    return false;
  }

  // initialize reduction operation required for parallel split-k conv3d operator
  reduction_op_ = reduction_it->second;

@ -614,13 +614,24 @@ bool Conv3dOperationProfiler::initialize_reduction_configuration_(

/// Initializes workspace
Status Conv3dOperationProfiler::initialize_workspace(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single "
                             "device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  // initialize conv3d underlying operation to handle parallel reduction
  library::Operation const* underlying_operation = operation;

@ -630,15 +641,15 @@ Status Conv3dOperationProfiler::initialize_workspace(
    }
  }

  library::ConvDescription const &operation_desc =
    static_cast<library::ConvDescription const &>(underlying_operation->description());

  // Compute the number of copies of the problem to avoid L2 camping.
  if (!options.profiling.workspace_count) {
    int64_t bytes = problem_.bytes(operation_desc);
    if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
    if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
      conv_workspace_.problem_count =
        1 + int((3 * int64_t(options.device.properties.l2CacheSize)) / bytes);
        1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
    }
    else {
      conv_workspace_.problem_count = 1;

@ -651,7 +662,7 @@ Status Conv3dOperationProfiler::initialize_workspace(

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    conv_workspace_.A = device_context.allocate_tensor(
    conv_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,

@ -659,10 +670,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
      problem_.extent_a(operation_desc.conv_kind),
      conv_workspace_.stride_a(operation_desc.conv_kind),
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    conv_workspace_.B = device_context.allocate_tensor(
    conv_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,

@ -670,10 +682,11 @@ Status Conv3dOperationProfiler::initialize_workspace(
      problem_.extent_b(operation_desc.conv_kind),
      conv_workspace_.stride_b(operation_desc.conv_kind),
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    conv_workspace_.C = device_context.allocate_tensor(
    conv_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,

@ -681,27 +694,32 @@ Status Conv3dOperationProfiler::initialize_workspace(
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.stride_c(operation_desc.conv_kind),
      conv_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    conv_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.stride_c(operation_desc.conv_kind),
      conv_workspace_.problem_count
      conv_workspace_.problem_count,
      0 // device_index
    );

    conv_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      problem_.extent_c(operation_desc.conv_kind),
      conv_workspace_.stride_c(operation_desc.conv_kind),
      conv_workspace_.problem_count
      conv_workspace_.problem_count,
      0 // device_index
    );
  }

  //

@ -733,10 +751,10 @@ Status Conv3dOperationProfiler::initialize_workspace(
    conv_workspace_.reduction_host_workspace.resize(workspace_size, 0);

    status = reduction_op_->initialize(
      &conv_workspace_.reduction_configuration,
      conv_workspace_.reduction_host_workspace.data(),
      nullptr);

    if (status != Status::kSuccess) {
      return status;
    }

@ -763,7 +781,7 @@ Status Conv3dOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool Conv3dOperationProfiler::verify_cutlass(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -784,7 +802,7 @@ bool Conv3dOperationProfiler::verify_cutlass(
  set_cutlass_operator_arguments_();

  conv_workspace_.Computed->copy_from_device(conv_workspace_.C->data());

  //
  // Run the CUTLASS operation
  //

@ -799,9 +817,9 @@ bool Conv3dOperationProfiler::verify_cutlass(
  }

#if 0
  std::cout << "profiling : " << std::endl
    << "conv3d : " << operation->description().name << std::endl
    << "underlying conv3d : " << underlying_operation->description().name << std::endl
    << "reduction : " << reduction_op_->description().name << std::endl;
#endif

@ -818,7 +836,7 @@ bool Conv3dOperationProfiler::verify_cutlass(

  // Run parallel reduction kernel for parallel split_k_mode
  if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {

    results_.back().status = reduction_op_->run(
      &conv_workspace_.reduction_arguments,
      conv_workspace_.reduction_host_workspace.data(),

@ -840,7 +858,7 @@ bool Conv3dOperationProfiler::verify_cutlass(

  // CUTLASS op ran but has not yet been verified against any verification provider
  results_.back().disposition = Disposition::kNotVerified;

  //
  // Run verification providers
  //

@ -856,7 +874,7 @@ bool Conv3dOperationProfiler::verify_cutlass(

    Status status = cudnn_satisfies(conv_desc, conv_workspace_.configuration);

    // Initialize reference data to the source data
    conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

    if (status == Status::kSuccess) {

@ -883,8 +901,8 @@ bool Conv3dOperationProfiler::verify_cutlass(

  // Run verification host reference
  if (options.verification.provider_enabled(library::Provider::kReferenceHost)) {

    // Restore reference data back to initial source data
    conv_workspace_.Reference->copy_from_device(conv_workspace_.C->data());

    verify_with_host_reference_(

@ -893,10 +911,10 @@ bool Conv3dOperationProfiler::verify_cutlass(
      device_context,
      operation,
      problem_space,
      problem);
  }

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for (auto &m : results_.back().verification_map) {

@ -921,7 +939,7 @@ bool Conv3dOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against host reference
bool Conv3dOperationProfiler::verify_with_host_reference_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -939,14 +957,14 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(

  library::ConvFunctionalKey conv_key(
    library::Provider::kReferenceHost,
    conv_desc.conv_kind,
    conv_desc.A.element,
    conv_desc.A.layout,
    conv_desc.B.element,
    conv_desc.B.layout,
    conv_desc.C.element,
    conv_desc.C.layout,
    conv_desc.tile_description.math_instruction.element_accumulator,
    conv_desc.element_epilogue);

#if 0 // debug print to check which host reference instance is selected

@ -959,12 +977,12 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(

    results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
    return true;
  }

  // conv3d host reference minimum cc is 0 (CPU) and no iterator algorithm
  library::ConvPreferenceKey preference_key(0, library::IteratorAlgorithmID::kNone);
  auto cc_it = operators_it->second.find(preference_key);

  if (cc_it == operators_it->second.end()) {
    results_.back().verification_map[library::Provider::kReferenceHost] = Disposition::kNotRun;
    return true;

@ -1035,9 +1053,9 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kReferenceHost] == Disposition::kIncorrect) {

    save_workspace(
      device_context,
      options,

@ -1053,7 +1071,7 @@ bool Conv3dOperationProfiler::verify_with_host_reference_(

/// Verifies CUTLASS against device reference
bool Conv3dOperationProfiler::verify_with_device_reference_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -1068,14 +1086,14 @@ bool Conv3dOperationProfiler::verify_with_device_reference_(

/// Measures performance results
bool Conv3dOperationProfiler::profile(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (options.profiling.provider_enabled(library::Provider::kCUTLASS)) {

    set_cutlass_operator_arguments_();

@ -1180,7 +1198,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
      return status;
    }
  }

  //
  // Initialize GPU timer
  //

@ -1198,9 +1216,9 @@ Status Conv3dOperationProfiler::profile_cutlass_(

    // Setup rotating workspace
    int problem_idx = (iteration % conv_workspace_.problem_count);

    set_cutlass_operator_arguments_(problem_idx);

    // Run underlying conv3d operation
    status = underlying_operation->run(
      arguments,

@ -1208,7 +1226,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
      device_workspace);

    // Run parallel reduction kernel for parallel split_k_mode
    if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
      status = reduction_op_->run(
        &conv_workspace_.reduction_arguments,
        conv_workspace_.reduction_host_workspace.data(),

@ -1229,7 +1247,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(
  //
  // Update performance result
  //

  runtime = timer.duration(iteration);

  return status;

@ -1240,7 +1258,7 @@ Status Conv3dOperationProfiler::profile_cutlass_(

/// Verifies CUTLASS against cudnn reference
bool Conv3dOperationProfiler::verify_with_cudnn_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -1257,7 +1275,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
  cudnnStatus_t status = handle.get_cudnn_create_status();

  if (status != CUDNN_STATUS_SUCCESS) {

    results_.back().verification_map[library::Provider::kCUDNN] = get_cutlass_disposition(status);
    return true;
  }

@ -1285,8 +1303,8 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
  // Construct dispatcher to cudnn operator
  //

  detail::cudnnConvDispatcher conv_op(
    conv_desc,
    conv_workspace_.configuration,
    conv_workspace_.arguments,
    handle

@ -1323,7 +1341,7 @@ bool Conv3dOperationProfiler::verify_with_cudnn_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kCUDNN] == Disposition::kIncorrect) {

    save_workspace(

@ -259,6 +259,25 @@ Status cublas_satisfies(library::GemmDescription const &desc) {
    return Status::kErrorNotSupported;
  }

  // Refer to https://docs.nvidia.com/cuda/cublas/#id105
  // FE5M2 inputs for both A and B are not supported in cuBLASLt
  if (desc.A.element == library::NumericTypeID::kFE5M2 &&
      desc.B.element == library::NumericTypeID::kFE5M2) {

    return Status::kErrorNotSupported;
  }

  // Refer to https://docs.nvidia.com/cuda/cublas/#id105
  // if input types A and B are FE5M2 and FE4M3, then the D type must be F32
  if (desc.A.element == library::NumericTypeID::kFE5M2 &&
      desc.B.element == library::NumericTypeID::kFE4M3 &&
      desc.C.element == library::NumericTypeID::kF32 &&
      desc.D.element != library::NumericTypeID::kF32) {

    return Status::kErrorNotSupported;
  }

  // output type S4 and S8 not supported in cuBLAS
  if (desc.C.element == library::NumericTypeID::kS4 ||
      desc.C.element == library::NumericTypeID::kS8) {
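The two FP8 guards above encode cuBLASLt's input/output combination rules for FE4M3/FE5M2. A simplified restatement as a standalone predicate (the enum is a stand-in for library::NumericTypeID; illustrative only):

enum class Type { FE4M3, FE5M2, F32, Other };

bool cublaslt_fp8_combination_ok(Type a, Type b, Type c, Type d) {
  // e5m2 x e5m2 products are not offered by cuBLASLt
  if (a == Type::FE5M2 && b == Type::FE5M2) {
    return false;
  }
  // e5m2 x e4m3 with an F32 C operand requires an F32 D operand
  if (a == Type::FE5M2 && b == Type::FE4M3 &&
      c == Type::F32 && d != Type::F32) {
    return false;
  }
  return true;
}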
@ -405,7 +424,261 @@ cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) {
  }
}

} // namespace detail

cublasLtGemmExDispatcher::cublasLtGemmExDispatcher(
  library::GemmDescription const &op_desc,
  library::GemmUniversalConfiguration configuration_,
  library::GemmUniversalArguments arguments_
):
  op_desc(op_desc), configuration(configuration_), arguments(arguments_), status(Status::kSuccess) {

  bool good = true;

  good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A));
  good = (good && get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B));
  good = (good && get_cublas_datatype(data_type_A, op_desc.A.element));
  good = (good && get_cublas_datatype(data_type_B, op_desc.B.element));
  good = (good && get_cublas_datatype(data_type_C, op_desc.C.element));

  good = (good && get_cublas_datatype(
    compute_data_type,
    op_desc.tile_description.math_instruction.element_accumulator));

  // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe
  // internal numerical data types used in the computation.
#if (__CUDACC_VER_MAJOR__ >= 11)
  library::OpcodeClassID const &opcode_class =
    op_desc.tile_description.math_instruction.opcode_class;

  if (good &&
      op_desc.A.element == library::NumericTypeID::kF32 &&
      op_desc.B.element == library::NumericTypeID::kF32 &&
      opcode_class == library::OpcodeClassID::kTensorOp) {

    compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
  }
  else if (good) {
    bool const isPedantic = false;
    switch (compute_data_type) {
      case CUDA_R_32F:
      case CUDA_C_32F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
        break;
      case CUDA_R_64F:
      case CUDA_C_64F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
        break;
      case CUDA_R_16F:
        compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
        break;
      case CUDA_R_32I:
        compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
        break;
      default:
        good = false;
        break;
    }
  }
#endif // __CUDACC_VER_MAJOR__ >= 11

  if (!good) {
    status = Status::kErrorNotSupported;
  }
}
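The dispatcher above splits cuBLASLt setup into construct / initialize_cublaslt / get_cublaslt_algo. For reference, a minimal standalone cuBLASLt call sequence for a column-major F32 GEMM looks like the following (error checking elided; this sketch is independent of the profiler's types):

#include <cublasLt.h>

void lt_gemm_f32(cublasLtHandle_t handle, int m, int n, int k,
                 float const *A, float const *B, float *C) {
  float alpha = 1.f, beta = 0.f;

  cublasLtMatmulDesc_t op;
  cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

  cublasLtMatrixLayout_t Adesc, Bdesc, Cdesc;
  cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_32F, m, k, m);  // lda = m
  cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_32F, k, n, k);  // ldb = k
  cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32F, m, n, m);  // ldc = m

  cublasLtMatmulPreference_t pref;
  cublasLtMatmulPreferenceCreate(&pref);

  cublasLtMatmulHeuristicResult_t result = {};
  int returned = 0;
  cublasLtMatmulAlgoGetHeuristic(handle, op, Adesc, Bdesc, Cdesc, Cdesc,
                                 pref, 1, &result, &returned);
  if (returned > 0) {
    // C doubles as the D operand since beta == 0
    cublasLtMatmul(handle, op, &alpha, A, Adesc, B, Bdesc,
                   &beta, C, Cdesc, C, Cdesc,
                   &result.algo, nullptr, 0, /*stream=*/0);
  }

  cublasLtMatmulPreferenceDestroy(pref);
  cublasLtMatrixLayoutDestroy(Cdesc);
  cublasLtMatrixLayoutDestroy(Bdesc);
  cublasLtMatrixLayoutDestroy(Adesc);
  cublasLtMatmulDescDestroy(op);
}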

void cublasLtGemmExDispatcher::initialize_cublaslt() {

  // Create the operation descriptor; see cublasLtMatmulDescAttributes_t for details about
  // defaults. Here we only need to set the transforms for A and B.
  cublasLtMatmulDescCreate(&operationDesc, compute_type, compute_data_type);
  cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_A, sizeof(trans_A));
  cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_B, sizeof(trans_B));

  uint64_t contiguous_A = (trans_A == CUBLAS_OP_N ? configuration.problem_size.m() : configuration.problem_size.k());
  uint64_t strided_A    = (trans_A == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.m());
  uint64_t contiguous_B = (trans_B == CUBLAS_OP_N ? configuration.problem_size.k() : configuration.problem_size.n());
  uint64_t strided_B    = (trans_B == CUBLAS_OP_N ? configuration.problem_size.n() : configuration.problem_size.k());

  // Create matrix descriptors; the defaults suffice here, so no extra attributes are set.
  // A table of supported type combinations can be found in the documentation:
  // https://docs.nvidia.com/cuda/cublas/index.html#cublasltmatmul
  cublasLtMatrixLayoutCreate(&Adesc, data_type_A, contiguous_A, strided_A, configuration.lda);
  cublasLtMatrixLayoutCreate(&Bdesc, data_type_B, contiguous_B, strided_B, configuration.ldb);
  cublasLtMatrixLayoutCreate(&Cdesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldc);
  cublasLtMatrixLayoutCreate(&Ddesc, data_type_C, configuration.problem_size.m(), configuration.problem_size.n(), configuration.ldd);
}
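
The calls above discard their return statuses. A minimal standalone sketch (not part of the commit) of the same descriptor setup with status propagation; m, n, k and the FP32 types are illustrative placeholders:

// Sketch: build matmul and layout descriptors for a column-major FP32 GEMM,
// returning the first failing cublasLt status to the caller.
cublasStatus_t make_descriptors(int64_t m, int64_t n, int64_t k,
                                cublasLtMatmulDesc_t *op_desc,
                                cublasLtMatrixLayout_t *a_desc,
                                cublasLtMatrixLayout_t *b_desc,
                                cublasLtMatrixLayout_t *c_desc) {
  // Compute in FP32 with FP32 scalars; both operands non-transposed.
  cublasStatus_t s = cublasLtMatmulDescCreate(op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
  if (s != CUBLAS_STATUS_SUCCESS) return s;
  cublasOperation_t op = CUBLAS_OP_N;
  s = cublasLtMatmulDescSetAttribute(*op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &op, sizeof(op));
  if (s != CUBLAS_STATUS_SUCCESS) return s;
  s = cublasLtMatmulDescSetAttribute(*op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &op, sizeof(op));
  if (s != CUBLAS_STATUS_SUCCESS) return s;
  // Column-major layouts: the leading dimension is the row count.
  s = cublasLtMatrixLayoutCreate(a_desc, CUDA_R_32F, m, k, m);
  if (s != CUBLAS_STATUS_SUCCESS) return s;
  s = cublasLtMatrixLayoutCreate(b_desc, CUDA_R_32F, k, n, k);
  if (s != CUBLAS_STATUS_SUCCESS) return s;
  return cublasLtMatrixLayoutCreate(c_desc, CUDA_R_32F, m, n, m);
}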

bool cublasLtGemmExDispatcher::get_cublaslt_algo(
  cublasLtHandle_t handle,
  AlgorithmMode algorithm_mode) {

  // Request eight algorithms from the heuristic call; cublasLt heuristics return at most eight.
  const int requestedAlgoCount = 8;
  int returnedResults = 0;
  cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {};

#if (__CUDACC_VER_MAJOR__ >= 12)
  // Decide, based on the unique operation name, whether to enable fast accumulation
  // for the cuBLAS kernel.
  std::string operation_name(op_desc.name);
  if (operation_name.find("fastaccum") != std::string::npos) {
    const int8_t fastAccuMode = 1;
    cublasLtMatmulDescSetAttribute(operationDesc,
                                   CUBLASLT_MATMUL_DESC_FAST_ACCUM,
                                   &fastAccuMode,
                                   sizeof(fastAccuMode));
  }
#endif // __CUDACC_VER_MAJOR__ >= 12

  // Use 32 MB for the Hopper kernels. This is the maximum workspace size passed to
  // the call to cublasLtMatmulAlgoGetHeuristic().
  size_t workspaceSizeForHeuristics = 32ULL * 1024 * 1024;
  void* workspaceHeuristic = nullptr;

  cudaError_t result = cudaMalloc((void **)&workspaceHeuristic, workspaceSizeForHeuristics);
  if (result != cudaSuccess) {
    throw std::bad_alloc();
  }

  // Create the preference handle; extra attributes could be used here to disable tensor ops
  // or to ensure the selected algorithm works with badly aligned A, B, C. For simplicity we
  // assume A, B, C are always well aligned (e.g., they come directly from cudaMalloc).
  cublasLtMatmulPreferenceCreate(&preference);
  cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSizeForHeuristics, sizeof(workspaceSizeForHeuristics));

  cublasLtMatmulAlgoGetHeuristic(handle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, requestedAlgoCount, heuristicResult, &returnedResults);

  if (returnedResults == 0) {
    return false;
  }

  int bestAlgoIdx = 0;

  //
  // Auto-tuning to find the best kernel for the given problem
  //
  if (algorithm_mode == AlgorithmMode::kBest) {
    float time = 0;
    float bestAlgoTime = 0;
    cudaStream_t stream;
    cudaEvent_t startEvent, stopEvent;

    cudaStreamCreate(&stream);
    cudaEventCreate(&startEvent);
    cudaEventCreate(&stopEvent);

    constexpr int repeatAlgoCheck = 5;
    std::vector<float> algoTimes(repeatAlgoCheck);

    for (int algoIdx = 0; algoIdx < returnedResults; algoIdx++) {
      for (int checkIdx = 0; checkIdx < repeatAlgoCheck; checkIdx++) {
        cudaEventRecord(startEvent, stream);

        cublasStatus_t status = cublasLtMatmul(handle,
                                               operationDesc,
                                               arguments.alpha,
                                               arguments.A,
                                               Adesc,
                                               arguments.B,
                                               Bdesc,
                                               arguments.beta,
                                               arguments.C,
                                               Cdesc,
                                               arguments.D,
                                               Ddesc,
                                               &heuristicResult[algoIdx].algo,
                                               workspaceHeuristic,
                                               heuristicResult[algoIdx].workspaceSize,
                                               stream);

        // Handle errors
        if (status != CUBLAS_STATUS_SUCCESS) {
          std::cerr << "cublasLtMatmul AutoTuning failed with status: " << cublasLtGetStatusName(status) << std::endl;
          return false;
        }

        cudaEventRecord(stopEvent, stream);
        cudaEventSynchronize(stopEvent);
        cudaEventElapsedTime(&time, startEvent, stopEvent);
        algoTimes[checkIdx] = time;
      }

      // Use the median of the repeated timings as this algorithm's score.
      const size_t size = algoTimes.size();
      if (size == 0) {
        time = 0;
      }

      std::sort(algoTimes.begin(), algoTimes.end());

      const size_t mid = size / 2;
      if (size % 2 == 0) {
        time = (algoTimes[mid] + algoTimes[mid - 1]) / 2;
      }
      else {
        time = algoTimes[mid];
      }

      if (algoIdx == 0 || time < bestAlgoTime) {
        bestAlgoTime = time;
        bestAlgoIdx = algoIdx;
      }
    }

#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
    std::cout << "\n";
    std::cout << "# Algorithms checked: " << returnedResults << "\n";
    std::cout << "WorkspaceSize Allocated: " << heuristicResult[bestAlgoIdx].workspaceSize << "\n";
    std::cout << "Algorithm selected after auto-tuning is:" << "\n";

    int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme;

    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
    cublasLtMatmulAlgoConfigGetAttribute(&heuristicResult[bestAlgoIdx].algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);

    printf("algo={ Id=%d, tileIdx=%d splitK=%d reduc=%d swizzle=%d custom=%d }\n",
           algoId, tile, numSplitsK, reductionScheme, swizzle, customOption);
#endif

    if (stream) cudaStreamDestroy(stream);
    if (startEvent) cudaEventDestroy(startEvent);
    if (stopEvent) cudaEventDestroy(stopEvent);
  }

  // Record the selected algorithm and allocate its workspace for the dispatcher.
  heuristicResult_ = heuristicResult[bestAlgoIdx];
  result = cudaMalloc((void **)&workspace, heuristicResult_.workspaceSize);
  if (result != cudaSuccess) {
    throw std::bad_alloc();
  }

  return true;
}
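
The median computation in the timing loop above could equally be factored into a small helper; a sketch for clarity only (the dispatcher computes it inline), assuming a non-empty sample vector:

// Median of a set of per-algorithm timings; mirrors the inline logic above.
static float median_time(std::vector<float> times) {
  std::sort(times.begin(), times.end());
  size_t const mid = times.size() / 2;
  if (times.size() % 2 == 0) {
    return (times[mid] + times[mid - 1]) / 2.0f;
  }
  return times[mid];
}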

cublasStatus_t cublasLtGemmExDispatcher::operator()(cublasLtHandle_t handle) {
  return cublasLtMatmul(handle,
                        operationDesc,
                        arguments.alpha,
                        arguments.A,
                        Adesc,
                        arguments.B,
                        Bdesc,
                        arguments.beta,
                        arguments.C,
                        Cdesc,
                        arguments.D,
                        Ddesc,
                        &heuristicResult_.algo,
                        workspace,
                        heuristicResult_.workspaceSize,
                        0); // launch on the default stream
}
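
Taken together, the dispatcher is used in three steps. A hedged sketch of the call sequence (mirroring verify_with_cublas_ later in this commit; handle, gemm_desc, configuration, and arguments are assumed to be already initialized):

// 1. Construct from the operation description and problem configuration.
detail::cublasLtGemmExDispatcher gemm_op(gemm_desc, configuration, arguments);
// 2. Build the cublasLt operation and matrix descriptors.
gemm_op.initialize_cublaslt();
// 3. Pick an algorithm (heuristic default, or auto-tuned with AlgorithmMode::kBest)...
if (gemm_op.get_cublaslt_algo(handle, AlgorithmMode::kDefault)) {
  // ...then launch the matmul itself.
  cublasStatus_t status = gemm_op(handle);
}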

} // namespace detail

/////////////////////////////////////////////////////////////////////////////////////////////////

@ -208,19 +208,6 @@ void CutlassProfiler::print_options_(std::ostream &out) {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Initializes the CUDA device
void CutlassProfiler::initialize_device_() {

  cudaError_t result = cudaSetDevice(options_.device.device);

  if (result != cudaSuccess) {
    std::cerr << "Failed to set device.";
    throw std::runtime_error("Failed to set device");
  }
}

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace profiler
} // namespace cutlass

@ -88,16 +88,16 @@ static std::vector<int64_t> get_packed_layout_stride(std::vector<int> const &ext

/// Returns the stride of a packed layout
std::vector<int64_t> DeviceAllocation::get_packed_layout(
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent) {

  std::vector<int64_t> stride;

  switch (layout_id) {
    case library::LayoutTypeID::kColumnMajor:
      stride = get_packed_layout_stride<cutlass::layout::ColumnMajor>(extent);
      break;
    case library::LayoutTypeID::kRowMajor:
      stride = get_packed_layout_stride<cutlass::layout::RowMajor>(extent);
      break;
    case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -159,7 +159,7 @@ std::vector<int64_t> DeviceAllocation::get_packed_layout(

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Template to use CUTLASS Layout functions to
template <typename Layout>
static size_t construct_layout_(
  void *bytes,
@ -177,8 +177,8 @@ static size_t construct_layout_(
    stride = get_packed_layout_stride<Layout>(extent);

  return construct_layout_<Layout>(
    bytes,
    layout_id,
    extent,
    stride);
}
@ -202,7 +202,7 @@ static size_t construct_layout_(

  // Pack it into bytes
  if (bytes) {
    *reinterpret_cast<Layout *>(bytes) = layout;
  }

  // Return capacity
@ -219,10 +219,10 @@ size_t DeviceAllocation::construct_layout(
  std::vector<int64_t> &stride) {

  switch (layout_id) {
    case library::LayoutTypeID::kColumnMajor:
      return construct_layout_<cutlass::layout::ColumnMajor>(bytes, layout_id, extent, stride);

    case library::LayoutTypeID::kRowMajor:
      return construct_layout_<cutlass::layout::RowMajor>(bytes, layout_id, extent, stride);

    case library::LayoutTypeID::kColumnMajorInterleavedK2:
@ -284,24 +284,26 @@ size_t DeviceAllocation::construct_layout(

/////////////////////////////////////////////////////////////////////////////////////////////////

DeviceAllocation::DeviceAllocation():
  type_(library::NumericTypeID::kInvalid),
  batch_stride_(0),
  capacity_(0),
  pointer_(nullptr),
  layout_(library::LayoutTypeID::kUnknown),
  batch_count_(1) {
  batch_count_(1),
  device_(-1) {

}

DeviceAllocation::DeviceAllocation(
  library::NumericTypeID type,
  size_t capacity
  size_t capacity,
  int device
):
  type_(type), batch_stride_(capacity), capacity_(capacity), pointer_(nullptr),
  layout_(library::LayoutTypeID::kUnknown), batch_count_(1) {
  layout_(library::LayoutTypeID::kUnknown), batch_count_(1), device_(device) {

  cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity));
  cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity));

  if (result != cudaSuccess) {
    type_ = library::NumericTypeID::kInvalid;
@ -312,13 +314,15 @@ DeviceAllocation::DeviceAllocation(
}

DeviceAllocation::DeviceAllocation(
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count
  int batch_count,
  int device
):
  type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)), pointer_(nullptr), batch_count_(1) {
  type_(type), batch_stride_(size_t(0)), capacity_(size_t(0)),
  pointer_(nullptr), batch_count_(1), device_(device) {

  reset(type, layout_id, extent, stride, batch_count);
}
@ -355,7 +359,7 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca
  batch_stride_ = capacity;
  capacity_ = capacity;

  cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type_, capacity_));
  cudaError_t result = this->malloc((void **)&pointer_, bytes(type_, capacity_));
  if (result != cudaSuccess) {
    throw std::bad_alloc();
  }
@ -373,9 +377,9 @@ DeviceAllocation &DeviceAllocation::reset(library::NumericTypeID type, size_t ca

/// Allocates memory for a given layout and tensor
DeviceAllocation &DeviceAllocation::reset(
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count) {

@ -391,14 +395,14 @@ DeviceAllocation &DeviceAllocation::reset(
  batch_count_ = batch_count;

  batch_stride_ = construct_layout(
    tensor_ref_buffer_.data() + sizeof(pointer_),
    layout_id,
    extent,
    stride_);

  capacity_ = batch_stride_ * batch_count_;

  cudaError_t result = cudaMalloc((void **)&pointer_, bytes(type, capacity_));
  cudaError_t result = this->malloc((void **)&pointer_, bytes(type, capacity_));
  if (result != cudaSuccess) {
    throw std::bad_alloc();
  }
@ -421,7 +425,7 @@ void *DeviceAllocation::data() const {
}

void *DeviceAllocation::batch_data(int batch_idx) const {
  return static_cast<char *>(data()) + batch_stride_bytes() * batch_idx;
}

library::LayoutTypeID DeviceAllocation::layout() const {
@ -1476,159 +1480,159 @@ void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeI

/// Returns true if two blocks have exactly the same value
bool DeviceAllocation::block_compare_equal(
  library::NumericTypeID numeric_type,
  void const *ptr_A,
  void const *ptr_B,
  size_t capacity) {

  switch (numeric_type) {
  case library::NumericTypeID::kFE4M3:
    return reference::device::BlockCompareEqual<float_e4m3_t>(
      reinterpret_cast<float_e4m3_t const *>(ptr_A),
      reinterpret_cast<float_e4m3_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kFE5M2:
    return reference::device::BlockCompareEqual<float_e5m2_t>(
      reinterpret_cast<float_e5m2_t const *>(ptr_A),
      reinterpret_cast<float_e5m2_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kF16:
    return reference::device::BlockCompareEqual<half_t>(
      reinterpret_cast<half_t const *>(ptr_A),
      reinterpret_cast<half_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kBF16:
    return reference::device::BlockCompareEqual<bfloat16_t>(
      reinterpret_cast<bfloat16_t const *>(ptr_A),
      reinterpret_cast<bfloat16_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kTF32:
    return reference::device::BlockCompareEqual<tfloat32_t>(
      reinterpret_cast<tfloat32_t const *>(ptr_A),
      reinterpret_cast<tfloat32_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kF32:
    return reference::device::BlockCompareEqual<float>(
      reinterpret_cast<float const *>(ptr_A),
      reinterpret_cast<float const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCF32:
    return reference::device::BlockCompareEqual<cutlass::complex<float> >(
      reinterpret_cast<complex<float> const *>(ptr_A),
      reinterpret_cast<complex<float> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCF16:
    return reference::device::BlockCompareEqual<complex<half_t>>(
      reinterpret_cast<complex<half_t> const *>(ptr_A),
      reinterpret_cast<complex<half_t> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCBF16:
    return reference::device::BlockCompareEqual<complex<bfloat16_t>>(
      reinterpret_cast<complex<bfloat16_t> const *>(ptr_A),
      reinterpret_cast<complex<bfloat16_t> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCTF32:
    return reference::device::BlockCompareEqual<complex<tfloat32_t>>(
      reinterpret_cast<complex<tfloat32_t> const *>(ptr_A),
      reinterpret_cast<complex<tfloat32_t> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kF64:
    return reference::device::BlockCompareEqual<double>(
      reinterpret_cast<double const *>(ptr_A),
      reinterpret_cast<double const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCF64:
    return reference::device::BlockCompareEqual<complex<double>>(
      reinterpret_cast<complex<double> const *>(ptr_A),
      reinterpret_cast<complex<double> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS2:
    return reference::device::BlockCompareEqual<int2b_t>(
      reinterpret_cast<int2b_t const *>(ptr_A),
      reinterpret_cast<int2b_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS4:
    return reference::device::BlockCompareEqual<int4b_t>(
      reinterpret_cast<int4b_t const *>(ptr_A),
      reinterpret_cast<int4b_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS8:
    return reference::device::BlockCompareEqual<int8_t>(
      reinterpret_cast<int8_t const *>(ptr_A),
      reinterpret_cast<int8_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS16:
    return reference::device::BlockCompareEqual<int16_t>(
      reinterpret_cast<int16_t const *>(ptr_A),
      reinterpret_cast<int16_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS32:
    return reference::device::BlockCompareEqual<int32_t>(
      reinterpret_cast<int32_t const *>(ptr_A),
      reinterpret_cast<int32_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kS64:
    return reference::device::BlockCompareEqual<int64_t>(
      reinterpret_cast<int64_t const *>(ptr_A),
      reinterpret_cast<int64_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kB1:
    return reference::device::BlockCompareEqual<uint1b_t>(
      reinterpret_cast<uint1b_t const *>(ptr_A),
      reinterpret_cast<uint1b_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU2:
    return reference::device::BlockCompareEqual<uint2b_t>(
      reinterpret_cast<uint2b_t const *>(ptr_A),
      reinterpret_cast<uint2b_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU4:
    return reference::device::BlockCompareEqual<uint4b_t>(
      reinterpret_cast<uint4b_t const *>(ptr_A),
      reinterpret_cast<uint4b_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU8:
    return reference::device::BlockCompareEqual<uint8_t>(
      reinterpret_cast<uint8_t const *>(ptr_A),
      reinterpret_cast<uint8_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU16:
    return reference::device::BlockCompareEqual<uint16_t>(
      reinterpret_cast<uint16_t const *>(ptr_A),
      reinterpret_cast<uint16_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU32:
    return reference::device::BlockCompareEqual<uint32_t>(
      reinterpret_cast<uint32_t const *>(ptr_A),
      reinterpret_cast<uint32_t const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kU64:
    return reference::device::BlockCompareEqual<uint64_t>(
      reinterpret_cast<uint64_t const *>(ptr_A),
      reinterpret_cast<uint64_t const *>(ptr_B),
      capacity);

  default:
@ -1638,9 +1642,9 @@ bool DeviceAllocation::block_compare_equal(

/// Returns true if two blocks have approximately the same value
bool DeviceAllocation::block_compare_relatively_equal(
  library::NumericTypeID numeric_type,
  void const *ptr_A,
  void const *ptr_B,
  size_t capacity,
  double epsilon,
  double nonzero_floor) {
@ -1648,161 +1652,161 @@ bool DeviceAllocation::block_compare_relatively_equal(
  switch (numeric_type) {
  case library::NumericTypeID::kFE4M3:
    return reference::device::BlockCompareRelativelyEqual<float_e4m3_t>(
      reinterpret_cast<float_e4m3_t const *>(ptr_A),
      reinterpret_cast<float_e4m3_t const *>(ptr_B),
      capacity,
      static_cast<float_e4m3_t>(epsilon),
      static_cast<float_e4m3_t>(nonzero_floor));

  case library::NumericTypeID::kFE5M2:
    return reference::device::BlockCompareRelativelyEqual<float_e5m2_t>(
      reinterpret_cast<float_e5m2_t const *>(ptr_A),
      reinterpret_cast<float_e5m2_t const *>(ptr_B),
      capacity,
      static_cast<float_e5m2_t>(epsilon),
      static_cast<float_e5m2_t>(nonzero_floor));

  case library::NumericTypeID::kF16:
    return reference::device::BlockCompareRelativelyEqual<half_t>(
      reinterpret_cast<half_t const *>(ptr_A),
      reinterpret_cast<half_t const *>(ptr_B),
      capacity,
      static_cast<half_t>(epsilon),
      static_cast<half_t>(nonzero_floor));

  case library::NumericTypeID::kBF16:
    return reference::device::BlockCompareRelativelyEqual<bfloat16_t>(
      reinterpret_cast<bfloat16_t const *>(ptr_A),
      reinterpret_cast<bfloat16_t const *>(ptr_B),
      capacity,
      static_cast<bfloat16_t>(epsilon),
      static_cast<bfloat16_t>(nonzero_floor));

  case library::NumericTypeID::kTF32:
    return reference::device::BlockCompareRelativelyEqual<tfloat32_t>(
      reinterpret_cast<tfloat32_t const *>(ptr_A),
      reinterpret_cast<tfloat32_t const *>(ptr_B),
      capacity,
      static_cast<tfloat32_t>(epsilon),
      static_cast<tfloat32_t>(nonzero_floor));

  case library::NumericTypeID::kF32:
    return reference::device::BlockCompareRelativelyEqual<float>(
      reinterpret_cast<float const *>(ptr_A),
      reinterpret_cast<float const *>(ptr_B),
      capacity,
      static_cast<float>(epsilon),
      static_cast<float>(nonzero_floor));

  case library::NumericTypeID::kF64:
    return reference::device::BlockCompareRelativelyEqual<double>(
      reinterpret_cast<double const *>(ptr_A),
      reinterpret_cast<double const *>(ptr_B),
      capacity,
      static_cast<double>(epsilon),
      static_cast<double>(nonzero_floor));

  case library::NumericTypeID::kS2:
    return reference::device::BlockCompareRelativelyEqual<int2b_t>(
      reinterpret_cast<int2b_t const *>(ptr_A),
      reinterpret_cast<int2b_t const *>(ptr_B),
      capacity,
      static_cast<int2b_t>(epsilon),
      static_cast<int2b_t>(nonzero_floor));

  case library::NumericTypeID::kS4:
    return reference::device::BlockCompareRelativelyEqual<int4b_t>(
      reinterpret_cast<int4b_t const *>(ptr_A),
      reinterpret_cast<int4b_t const *>(ptr_B),
      capacity,
      static_cast<int4b_t>(epsilon),
      static_cast<int4b_t>(nonzero_floor));

  case library::NumericTypeID::kS8:
    return reference::device::BlockCompareRelativelyEqual<int8_t>(
      reinterpret_cast<int8_t const *>(ptr_A),
      reinterpret_cast<int8_t const *>(ptr_B),
      capacity,
      static_cast<int8_t>(epsilon),
      static_cast<int8_t>(nonzero_floor));

  case library::NumericTypeID::kS16:
    return reference::device::BlockCompareRelativelyEqual<int16_t>(
      reinterpret_cast<int16_t const *>(ptr_A),
      reinterpret_cast<int16_t const *>(ptr_B),
      capacity,
      static_cast<int16_t>(epsilon),
      static_cast<int16_t>(nonzero_floor));

  case library::NumericTypeID::kS32:
    return reference::device::BlockCompareRelativelyEqual<int32_t>(
      reinterpret_cast<int32_t const *>(ptr_A),
      reinterpret_cast<int32_t const *>(ptr_B),
      capacity,
      static_cast<int32_t>(epsilon),
      static_cast<int32_t>(nonzero_floor));

  case library::NumericTypeID::kS64:
    return reference::device::BlockCompareRelativelyEqual<int64_t>(
      reinterpret_cast<int64_t const *>(ptr_A),
      reinterpret_cast<int64_t const *>(ptr_B),
      capacity,
      static_cast<int64_t>(epsilon),
      static_cast<int64_t>(nonzero_floor));

  case library::NumericTypeID::kB1:
    return reference::device::BlockCompareRelativelyEqual<uint1b_t>(
      reinterpret_cast<uint1b_t const *>(ptr_A),
      reinterpret_cast<uint1b_t const *>(ptr_B),
      capacity,
      static_cast<uint1b_t>(epsilon),
      static_cast<uint1b_t>(nonzero_floor));

  case library::NumericTypeID::kU2:
    return reference::device::BlockCompareRelativelyEqual<uint2b_t>(
      reinterpret_cast<uint2b_t const *>(ptr_A),
      reinterpret_cast<uint2b_t const *>(ptr_B),
      capacity,
      static_cast<uint2b_t>(epsilon),
      static_cast<uint2b_t>(nonzero_floor));

  case library::NumericTypeID::kU4:
    return reference::device::BlockCompareRelativelyEqual<uint4b_t>(
      reinterpret_cast<uint4b_t const *>(ptr_A),
      reinterpret_cast<uint4b_t const *>(ptr_B),
      capacity,
      static_cast<uint4b_t>(epsilon),
      static_cast<uint4b_t>(nonzero_floor));

  case library::NumericTypeID::kU8:
    return reference::device::BlockCompareRelativelyEqual<uint8_t>(
      reinterpret_cast<uint8_t const *>(ptr_A),
      reinterpret_cast<uint8_t const *>(ptr_B),
      capacity,
      static_cast<uint8_t>(epsilon),
      static_cast<uint8_t>(nonzero_floor));

  case library::NumericTypeID::kU16:
    return reference::device::BlockCompareRelativelyEqual<uint16_t>(
      reinterpret_cast<uint16_t const *>(ptr_A),
      reinterpret_cast<uint16_t const *>(ptr_B),
      capacity,
      static_cast<uint16_t>(epsilon),
      static_cast<uint16_t>(nonzero_floor));

  case library::NumericTypeID::kU32:
    return reference::device::BlockCompareRelativelyEqual<uint32_t>(
      reinterpret_cast<uint32_t const *>(ptr_A),
      reinterpret_cast<uint32_t const *>(ptr_B),
      capacity,
      static_cast<uint32_t>(epsilon),
      static_cast<uint32_t>(nonzero_floor));

  case library::NumericTypeID::kU64:
    return reference::device::BlockCompareRelativelyEqual<uint64_t>(
      reinterpret_cast<uint64_t const *>(ptr_A),
      reinterpret_cast<uint64_t const *>(ptr_B),
      capacity,
      static_cast<uint64_t>(epsilon),
      static_cast<uint64_t>(nonzero_floor));

  // No relatively equal comparison for complex numbers.
@ -1821,7 +1825,7 @@ bool DeviceAllocation::block_compare_relatively_equal(
      reinterpret_cast<complex<float> const *>(ptr_A),
      reinterpret_cast<complex<float> const *>(ptr_B),
      capacity);

  case library::NumericTypeID::kCF64:
    return reference::device::BlockCompareEqual<cutlass::complex<double> >(
      reinterpret_cast<complex<double> const *>(ptr_A),
@ -1837,14 +1841,14 @@ bool DeviceAllocation::block_compare_relatively_equal(

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord, int Rank>
struct vector_to_coord {

  vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {

    coord[Rank - 1] = vec.at(Rank - 1);

    if (Rank > 1) {
      vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
    }
@ -1853,17 +1857,17 @@ struct vector_to_coord {
  vector_to_coord(TensorCoord &coord, std::vector<int64_t> const &vec) {

    coord[Rank - 1] = (int)vec.at(Rank - 1);

    if (Rank > 1) {
      vector_to_coord<TensorCoord, Rank - 1>(coord, vec);
    }
  }
};

/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 1> {

  vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {

    coord[0] = vec.at(0);
@ -1875,10 +1879,10 @@ struct vector_to_coord<TensorCoord, 1> {
  }
};

/// Permits copying dynamic vectors into static-length vectors
template <typename TensorCoord>
struct vector_to_coord<TensorCoord, 0> {

  vector_to_coord(TensorCoord &coord, std::vector<int> const &vec) {

  }
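
The templates above implement a compile-time recursion that peels one rank per instantiation until the Rank-1 and Rank-0 specializations terminate it. A brief usage sketch (cutlass::Coord<3> and the extent values are chosen purely for illustration):

// Copies a dynamic {4, 8, 16} extent into a static rank-3 coordinate.
std::vector<int> extent_vec = {4, 8, 16};
cutlass::Coord<3> extent;
vector_to_coord<cutlass::Coord<3>, 3>(extent, extent_vec);  // extent is now (4, 8, 16)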
@ -1888,7 +1892,7 @@ struct vector_to_coord<TensorCoord, 0> {

template <typename Element, typename Layout>
static void write_tensor_csv_static_tensor_view(
  std::ostream &out,
  DeviceAllocation &allocation) {

  Coord<Layout::kRank> extent;
@ -1903,7 +1907,7 @@ static void write_tensor_csv_static_tensor_view(
  }

  vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
  vector_to_coord<Coord<Layout::kStrideRank, typename Layout::Stride::Index>,
                  Layout::kStrideRank>(stride, allocation.stride());

  Layout layout(stride);
@ -1914,7 +1918,7 @@ static void write_tensor_csv_static_tensor_view(
  }

  host_tensor.copy_in_device_to_host(
    static_cast<Element const *>(allocation.data()),
    allocation.batch_stride());

  TensorViewWrite(out, host_tensor.host_view());
@ -1926,7 +1930,7 @@ static void write_tensor_csv_static_tensor_view(

template <typename T>
static void write_tensor_csv_static_type(
  std::ostream &out,
  DeviceAllocation &allocation) {

  switch (allocation.layout()) {
@ -1991,7 +1995,7 @@ static void write_tensor_csv_static_type(

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Writes a tensor to csv
void DeviceAllocation::write_tensor_csv(
  std::ostream &out) {

@ -1999,14 +2003,14 @@ void DeviceAllocation::write_tensor_csv(
  case library::NumericTypeID::kFE4M3:
    write_tensor_csv_static_type<float_e4m3_t>(out, *this);
    break;

  case library::NumericTypeID::kFE5M2:
    write_tensor_csv_static_type<float_e5m2_t>(out, *this);
    break;

  case library::NumericTypeID::kF16:
    write_tensor_csv_static_type<half_t>(out, *this);
    break;

  case library::NumericTypeID::kBF16:
    write_tensor_csv_static_type<bfloat16_t>(out, *this);
    break;
@ -2022,7 +2026,7 @@ void DeviceAllocation::write_tensor_csv(
  case library::NumericTypeID::kF64:
    write_tensor_csv_static_type<double>(out, *this);
    break;

  case library::NumericTypeID::kS2:
    write_tensor_csv_static_type<int2b_t>(out, *this);
    break;
@ -2046,7 +2050,7 @@ void DeviceAllocation::write_tensor_csv(
  case library::NumericTypeID::kS64:
    write_tensor_csv_static_type<int64_t>(out, *this);
    break;

  case library::NumericTypeID::kB1:
    write_tensor_csv_static_type<uint1b_t>(out, *this);
    break;
@ -2074,7 +2078,7 @@ void DeviceAllocation::write_tensor_csv(
  case library::NumericTypeID::kU64:
    write_tensor_csv_static_type<uint64_t>(out, *this);
    break;

  case library::NumericTypeID::kCF16:
    write_tensor_csv_static_type<cutlass::complex<half_t> >(out, *this);
    break;
@ -2110,7 +2114,7 @@ static void tensor_fill_tensor_view(DeviceAllocation &allocation, Element val =
  }

  vector_to_coord<Coord<Layout::kRank>, Layout::kRank>(extent, allocation.extent());
  vector_to_coord<Coord<Layout::kStrideRank, typename Layout::LongIndex>,
                  Layout::kStrideRank>(stride, allocation.stride());

  TensorView<Element, Layout> view(
@ -2432,6 +2436,46 @@ void DeviceAllocation::fill_host(double val = 0.0) {
  copy_from_host(host_data.data());
}

cudaError_t DeviceAllocation::malloc(void** ptr, size_t size) {
  cudaError_t result;
  int set_device_back_to = -1;

  // When needed, switch to the allocation's device, remembering the current
  // device so that it can be restored after the cudaMalloc is performed.
  if (device_ >= 0) {
    int current_device;
    result = cudaGetDevice(&current_device);
    if (result != cudaSuccess) {
      return result;
    }

    if (current_device != device_) {
      set_device_back_to = current_device;
      result = cudaSetDevice(device_);
      if (result != cudaSuccess) {
        return result;
      }
    }
  }

  // Perform the cudaMalloc on the target device.
  result = cudaMalloc(ptr, size);
  if (result != cudaSuccess) {
    return result;
  }

  // When needed, restore the device that was current when the function was called.
  if (set_device_back_to != -1) {
    result = cudaSetDevice(set_device_back_to);
    if (result != cudaSuccess) {
      return result;
    }
  }

  return cudaSuccess;
}
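
The save/switch/restore pattern above can also be expressed as a small RAII guard. A sketch (not part of the commit) of the same logic, using only CUDA runtime calls and ignoring errors for brevity:

// RAII helper: switches to `device` if needed, restores the previous device on scope exit.
class DeviceGuard {
public:
  explicit DeviceGuard(int device) {
    if (device >= 0) {
      cudaGetDevice(&previous_);
      if (previous_ != device) {
        cudaSetDevice(device);
        restore_ = true;
      }
    }
  }
  ~DeviceGuard() {
    if (restore_) {
      cudaSetDevice(previous_);
    }
  }
private:
  int previous_{-1};
  bool restore_{false};
};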

/////////////////////////////////////////////////////////////////////////////////////////////////

@ -29,7 +29,7 @@
 *
 **************************************************************************************************/
/* \file
   \brief
*/

#include "cutlass/profiler/device_context.h"
@ -41,29 +41,16 @@ namespace profiler {

/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_block(
  Options const &options,
  std::string const &name,
  library::NumericTypeID type,
  size_t capacity) {
  size_t capacity,
  size_t device_index) {

  device_memory_.emplace_back(type, capacity);
  int device = options.device.device_id(device_index);
  device_memory_.emplace_back(type, capacity, device);
  DeviceAllocation *allocation = &device_memory_.back();

  allocations_[name] = allocation;
  return allocation;
}

/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_tensor(
  std::string const &name,
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count) {

  device_memory_.emplace_back(type, layout_id, extent, stride, batch_count);
  DeviceAllocation *allocation = &device_memory_.back();

  allocations_[name] = allocation;
  return allocation;
}
@ -72,18 +59,40 @@ DeviceAllocation *DeviceContext::allocate_tensor(
DeviceAllocation *DeviceContext::allocate_tensor(
  Options const &options,
  std::string const &name,
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count,
  int seed_shift) {
  size_t device_index) {

  DeviceAllocation *allocation =
    allocate_tensor(name, type, layout_id, extent, stride, batch_count);
  int device = options.device.device_id(device_index);
  device_memory_.emplace_back(type, layout_id, extent, stride, batch_count,
                              device);
  DeviceAllocation *allocation = &device_memory_.back();

  allocations_[name] = allocation;
  return allocation;
}
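
A hedged usage sketch of the device-aware allocation entry points above (the `ctx` and `opts` objects are hypothetical placeholders; device_index selects into the user-supplied device list):

// 1M FP16 elements placed on the first user-selected device.
DeviceAllocation *block = ctx.allocate_block(
  opts, "workspace", library::NumericTypeID::kF16, 1 << 20, /*device_index=*/0);

// A 128x128 column-major FP32 tensor (lda = 128) on the same device.
DeviceAllocation *tensor = ctx.allocate_tensor(
  opts, "A", library::NumericTypeID::kF32, library::LayoutTypeID::kColumnMajor,
  {128, 128}, {128}, /*batch_count=*/1, /*device_index=*/0);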

/// Allocates memory of a given type, capacity (elements), and name
DeviceAllocation *DeviceContext::allocate_and_initialize_tensor(
  Options const &options,
  std::string const &name,
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count,
  int seed_shift,
  size_t device_index) {

  DeviceAllocation *allocation =
    allocate_tensor(options, name, type, layout_id, extent, stride,
                    batch_count, device_index);

  if (options.initialization.enabled) {
    Distribution data_distribution = options.initialization.data_distribution;

    // Check whether the data distribution is allowed to change
    if (!options.initialization.fix_data_distribution) {
@ -129,13 +138,13 @@ DeviceAllocation *DeviceContext::allocate_tensor(
      double stddev = data_distribution.gaussian.stddev;
      int scale = data_distribution.int_scale;

      if (name == "A" && data_distribution.gaussian.pnzA != 100.0) {
      if (name == "A" && data_distribution.gaussian.pnzA != 1.0) {
        data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzA);
      }
      else if (name == "B" && data_distribution.gaussian.pnzB != 100.0) {
      else if (name == "B" && data_distribution.gaussian.pnzB != 1.0) {
        data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzB);
      }
      else if (name == "C" && data_distribution.gaussian.pnzC != 100.0) {
      else if (name == "C" && data_distribution.gaussian.pnzC != 1.0) {
        data_distribution.set_gaussian(mean, stddev, scale, data_distribution.gaussian.pnzC);
      }
    }
@ -147,7 +156,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
      }
      else {
        allocation->initialize_random_device(
          options.initialization.seed + seed_shift,
          data_distribution);
      }
    }
@ -158,7 +167,7 @@ DeviceAllocation *DeviceContext::allocate_tensor(
      }
      else {
        allocation->initialize_random_host(
          options.initialization.seed + seed_shift,
          data_distribution);
      }
    }
@ -167,20 +176,22 @@ DeviceAllocation *DeviceContext::allocate_tensor(
  return allocation;
}

/// Allocates memory for sparse meta data
DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(
DeviceAllocation *DeviceContext::allocate_and_initialize_sparsemeta_tensor(
  Options const &options,
  std::string const &name,
  library::NumericTypeID type,
  library::LayoutTypeID layout_id,
  library::NumericTypeID type_a,
  std::vector<int> const &extent,
  std::vector<int64_t> const &stride,
  int batch_count,
  int seed_shift) {
  int seed_shift,
  size_t device_index) {

  DeviceAllocation *allocation =
    allocate_tensor(name, type, layout_id, extent, stride, batch_count);
  DeviceAllocation *allocation =
    allocate_tensor(options, name, type, layout_id, extent, stride,
                    batch_count, device_index);

  if (options.initialization.enabled) {
    // TF32 has 4-bit metadata. The other types have 2-bit metadata.
@ -188,12 +199,12 @@ DeviceAllocation *DeviceContext::allocate_sparsemeta_tensor(

    if (options.initialization.provider == library::Provider::kReferenceDevice) {
      allocation->initialize_random_sparsemeta_device(
        options.initialization.seed + seed_shift,
        MetaSizeInBits);
    }
    else if (options.initialization.provider == library::Provider::kReferenceHost) {
      allocation->initialize_random_sparsemeta_host(
        options.initialization.seed + seed_shift,
        MetaSizeInBits);
    }
  }

@ -39,6 +39,7 @@
#include <vector>

#include "cutlass/core_io.h"
#include <cuda_runtime_api.h>

#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/gemm_operation_profiler.h"
@ -46,7 +47,6 @@
#include "cutlass/library/singleton.h"
#include "cutlass/library/library.h"
#include "cutlass/library/handle.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
@ -485,6 +485,17 @@ Status GemmOperationProfiler::initialize_workspace(
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single "
                             "device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  library::Operation const* underlying_operation = operation;

  if (problem_.split_k_mode == library::SplitKMode::kParallel) {
@ -496,12 +507,14 @@ Status GemmOperationProfiler::initialize_workspace(
  library::GemmDescription const &operation_desc =
    static_cast<library::GemmDescription const &>(operation->description());

  bool is_sparse = operation_desc.tile_description.math_instruction.opcode_class == cutlass::library::OpcodeClassID::kSparseTensorOp;

  // Compute the number of copies of the problem to avoid L2 camping.
  if (!options.profiling.workspace_count) {
    int64_t bytes = problem_.bytes(operation_desc);
    if (bytes < 3 * int64_t(options.device.properties.l2CacheSize)) {
    if (bytes < 3 * int64_t(options.device.properties[0].l2CacheSize)) {
      gemm_workspace_.problem_count =
        1 + int((3 * int64_t(options.device.properties[0].l2CacheSize)) / bytes);
    }
    else {
      gemm_workspace_.problem_count = 1;
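
A worked example of the sizing rule above (the numbers are illustrative, not from the commit): with a 40 MiB L2 cache (properties[0].l2CacheSize = 41943040) and a problem touching 16 MiB of tensor data, 16 MiB < 3 * 40 MiB, so problem_count = 1 + int(120 MiB / 16 MiB) = 8 copies are allocated, ensuring consecutive profiling iterations do not re-read data still resident in L2.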
@ -514,7 +527,7 @@ Status GemmOperationProfiler::initialize_workspace(
  bool allocate_device_tensors = options.execution_mode != ExecutionMode::kDryRun;
  if (allocate_device_tensors) {
    int seed_shift = 0;
    gemm_workspace_.A = device_context.allocate_tensor(
    gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,
@ -522,10 +535,11 @@ Status GemmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.k)},
      {int(problem_.lda)},
      problem_.batch_count * gemm_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.B = device_context.allocate_tensor(
    gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,
@ -533,10 +547,11 @@ Status GemmOperationProfiler::initialize_workspace(
      {int(problem_.k), int(problem_.n)},
      {int(problem_.ldb)},
      problem_.batch_count * gemm_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.C = device_context.allocate_tensor(
    gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,
@ -544,25 +559,30 @@ Status GemmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)},
      problem_.batch_count * gemm_workspace_.problem_count,
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.D.element,
      operation_desc.D.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)},
      problem_.batch_count * gemm_workspace_.problem_count
      problem_.batch_count * gemm_workspace_.problem_count,
      0 // device_index
    );

    gemm_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.D.element,
      operation_desc.D.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)},
      problem_.batch_count * gemm_workspace_.problem_count
      problem_.batch_count * gemm_workspace_.problem_count,
      0 // device_index
    );
  }

@ -580,7 +600,7 @@ Status GemmOperationProfiler::initialize_workspace(
    gemm_workspace_.arguments.batch_stride_D = gemm_workspace_.Computed->batch_stride();

    /* Query device SM count to pass onto the kernel as an argument, where needed */
    gemm_workspace_.arguments.sm_count = options.device.properties.multiProcessorCount;
    gemm_workspace_.arguments.sm_count = options.device.properties[0].multiProcessorCount;
  }

  //
@ -596,12 +616,34 @@ Status GemmOperationProfiler::initialize_workspace(

  workspace_size = underlying_operation->get_device_workspace_size(&gemm_workspace_.configuration,
                                                                   &gemm_workspace_.arguments);
  if (is_sparse) {
    // For sparse GEMM, get_device_workspace_size() returns the device workspace size
    // per iteration, so multiply it by the number of iterations.
    workspace_size *= gemm_workspace_.problem_count;
  }
  gemm_workspace_.device_workspace.reset(library::NumericTypeID::kU8, workspace_size);

  status = underlying_operation->initialize(
    &gemm_workspace_.configuration,
    gemm_workspace_.host_workspace.data(),
    gemm_workspace_.device_workspace.data());
  // Convert to structured sparse contents here.
  if (is_sparse) {
    uint8_t* profiler_workspaces[1];
    profiler_workspaces[0] = reinterpret_cast<uint8_t*>(gemm_workspace_.A->data());
    // Sparse operations have a different initialize interface.
    // initialize_with_profiler_workspace compresses the m-by-k tensor A into the
    // m-by-(k/sp) form and generates the metadata tensor E.
    auto modifiable_underlying_op = const_cast<library::Operation*>(underlying_operation);
    status = modifiable_underlying_op->initialize_with_profiler_workspace(
      &gemm_workspace_.configuration,
      gemm_workspace_.host_workspace.data(),
      gemm_workspace_.device_workspace.data(),
      profiler_workspaces,
      gemm_workspace_.problem_count);
  }
  else {
    status = underlying_operation->initialize(
      &gemm_workspace_.configuration,
      gemm_workspace_.host_workspace.data(),
      gemm_workspace_.device_workspace.data());
  }

  if (status != Status::kSuccess) {
    return status;
  }
@ -821,26 +863,14 @@ bool GemmOperationProfiler::verify_with_cublas_(
  // Construct cuBLAS operators
  //

  CublasCreate handle;
  cublasStatus_t status = handle.get_cublas_create_status();
  CublasLtCreate handle;
  cublasStatus_t status = handle.get_cublaslt_create_status();

  if (status != CUBLAS_STATUS_SUCCESS) {

    results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
    return true;
  }

  std::vector<cublasGemmAlgo_t> algorithms;

  detail::select_cublas_algorithms(
    algorithms,
    options,
    gemm_desc);

  if (algorithms.empty()) {
    // no algorithm selected
    return true;
  }

  //
  // Initialize state
@ -865,29 +895,34 @@ bool GemmOperationProfiler::verify_with_cublas_(
  gemm_workspace_.arguments.beta = problem_.beta.data();
  gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  detail::cublasGemmExDispatcher gemm_op(
  detail::cublasLtGemmExDispatcher gemm_op(
    gemm_desc,
    gemm_workspace_.configuration,
    gemm_workspace_.arguments,
    algorithms.front()
    gemm_workspace_.arguments
  );

  gemm_op.initialize_cublaslt();

  if (!gemm_op.get_cublaslt_algo(handle, AlgorithmMode::kDefault)) {
    return true;
  }

  if (gemm_op.status != Status::kSuccess) {
    results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun;
    return true;
  }

  results_.back().status = Status::kSuccess;

  status = gemm_op(handle);

  // Handle errors
  if (status != CUBLAS_STATUS_SUCCESS) {

    std::cerr << "cublasLt verification run failed with status: " << cublasLtGetStatusName(status) << "\n";
    results_.back().verification_map[library::Provider::kCUBLAS] = get_cutlass_disposition(status);
    return true;
  }

  results_.back().status = Status::kSuccess;

  //
  // Verify results
  //
@ -930,9 +965,9 @@ bool GemmOperationProfiler::verify_with_reference_(
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem,
  cutlass::library::NumericTypeID element_A,
  cutlass::library::NumericTypeID element_B)
{
  library::GemmDescription const &gemm_desc =
    static_cast<library::GemmDescription const &>(operation->description());

@ -376,14 +376,14 @@ int OperationProfiler::profile_all(
      std::cerr << " @ provider " << operation->description().provider
                << " != library::Provider::kCUTLASS\n";
    }
    if (options.device.compute_capability() < min_cc) {
    if (options.device.compute_capability(0) < min_cc) {
      std::cerr << " @ compute_capability "
                << options.device.compute_capability()
                << options.device.compute_capability(0)
                << " < min_cc " << min_cc << "\n";
    }
    if (options.device.compute_capability() > max_cc) {
    if (options.device.compute_capability(0) > max_cc) {
      std::cerr << " @ compute_capability "
                << options.device.compute_capability()
                << options.device.compute_capability(0)
                << " > max_cc " << max_cc << "\n";
    }
#endif
@ -391,8 +391,8 @@ int OperationProfiler::profile_all(
    // Execute compatible cutlass operations if they satisfy the current device's compute capability
    if (operation->description().kind == kind_ &&
        operation->description().provider == library::Provider::kCUTLASS &&
        options.device.compute_capability() >= min_cc &&
        options.device.compute_capability() <= max_cc) {
        options.device.compute_capability(0) >= min_cc &&
        options.device.compute_capability(0) <= max_cc) {

      std::string operation_name(operation->description().name);
      // Filter kernels by name

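The gating rule above reduces to a closed interval test on the first device's compute capability; a self-contained sketch (values illustrative):

#include <cassert>

// Mirrors Options::Device::compute_capability(0): cc = major * 10 + minor.
inline int encode_cc(int major, int minor) { return major * 10 + minor; }

int main() {
  int cc = encode_cc(9, 0);          // e.g. an SM 9.0 device reports 90
  int min_cc = 90, max_cc = 90;      // per-kernel bounds supplied by the library
  bool runnable = (cc >= min_cc) && (cc <= max_cc);
  assert(runnable);
  return 0;
}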
@ -33,6 +33,7 @@
*/

#include <algorithm>
#include <set>

#include "cutlass/cutlass.h"
#include "cutlass/version.h"
@ -55,45 +56,97 @@ static char const *end_of_line = "\n

Options::Device::Device(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("device", device, 0);

  // Gets the number of devices for future validation
  cudaError_t result;
  result = cudaGetDeviceProperties(&properties, device);

  result = cudaGetDeviceCount(&num_devices);
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
    throw std::runtime_error("cudaGetDeviceCount() failed");
  }

  result = cudaSetDevice(device);
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed for given device.");
  }

  // Permit overriding the compute capability
  if (cmdline.check_cmd_line_flag("compute-capability")) {
    int cc = compute_capability();
    cmdline.get_cmd_line_argument("compute-capability", cc, cc);
    properties.major = cc / 10;
    properties.minor = cc % 10;
  }

  // Permit overriding the L2 cache capacity
  if (cmdline.check_cmd_line_flag("llc-capacity")) {
    int llc_capacity = 0;
    cmdline.get_cmd_line_argument("llc-capacity", llc_capacity, 0);

    if (llc_capacity >= 0) {
      properties.l2CacheSize = (llc_capacity << 10);
  // Gets the devices specified by the user.
  // This preserves the user-specified order and checks for duplicates.
  {
    std::vector<int> temp_device_list;
    cmdline.get_cmd_line_arguments("devices", temp_device_list);
    if (temp_device_list.empty()) {
      temp_device_list.push_back(0);
    }
    {
      std::set<int> temp_device_set;
      for (int device : temp_device_list) {
        auto res = temp_device_set.insert(device);
        if (!res.second) {
          throw std::runtime_error("Duplicate device specified: " + std::to_string(device));
        } else if (device >= num_devices) {
          throw std::runtime_error("Bad device ID: " + std::to_string(device));
        } else {
          devices.push_back(device);
        }
      }
    }
  }

  properties.resize(devices.size());
  // Retrieves properties for all specified devices
  for (size_t device_index = 0; device_index < devices.size(); device_index++) {
    int device = devices[device_index];

    result = cudaGetDeviceProperties(&properties[device_index], device);

    if (result != cudaSuccess) {
      throw std::runtime_error("cudaGetDeviceProperties() failed for given device");
    }

    // Check that all devices are the same
    if (device_index > 0) {
      if ((properties[device_index].major != properties[0].major) ||
          (properties[device_index].minor != properties[0].minor)) {
        throw std::runtime_error("All selected devices must have the same compute capability");
      }
      if (properties[device_index].l2CacheSize != properties[0].l2CacheSize) {
        throw std::runtime_error("All selected devices must have the same L2 cache size");
      }
      if (properties[device_index].multiProcessorCount != properties[0].multiProcessorCount) {
        throw std::runtime_error("All selected devices must have the same SM count");
      }
    }

    result = cudaSetDevice(device);
    if (result != cudaSuccess) {
      throw std::runtime_error("cudaSetDevice() failed for given device.");
    }

    // Permit overriding the compute capability
    if (cmdline.check_cmd_line_flag("compute-capability")) {
      int cc = compute_capability(device_index);
      cmdline.get_cmd_line_argument("compute-capability", cc, cc);
      properties[device_index].major = cc / 10;
      properties[device_index].minor = cc % 10;
    }

    // Permit overriding the L2 cache capacity
    if (cmdline.check_cmd_line_flag("llc-capacity")) {
      int llc_capacity = 0;
      cmdline.get_cmd_line_argument("llc-capacity", llc_capacity, 0);

      if (llc_capacity >= 0) {
        properties[device_index].l2CacheSize = (llc_capacity << 10);
      }
    }
  }
}

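The device-list validation above, restated as a standalone function (sketch; only standard-library types, no CUTLASS or CUDA dependencies):

#include <set>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int> validate_devices(std::vector<int> requested, int num_devices) {
  if (requested.empty()) {
    requested.push_back(0);  // default to device 0, as above
  }
  std::set<int> seen;
  std::vector<int> devices;
  for (int device : requested) {
    if (!seen.insert(device).second) {
      throw std::runtime_error("Duplicate device specified: " + std::to_string(device));
    }
    if (device < 0 || device >= num_devices) {
      throw std::runtime_error("Bad device ID: " + std::to_string(device));
    }
    devices.push_back(device);  // user-specified order is preserved
  }
  return devices;
}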
void Options::Device::print_usage(std::ostream &out) const {

  out << "Device:\n"
      << " --device=<int> "
      << " CUDA Device ID\n\n";
      << " --devices=<int>,<int>,... "
      << " CUDA Device IDs\n\n";

  int device_count = 0;
  cudaError_t result = cudaGetDeviceCount(&device_count);
@ -111,11 +164,11 @@ void Options::Device::print_usage(std::ostream &out) const {
      break;
    }
    else {
      out << " [" << idx << "] - "
          << prop.name << " - SM " << prop.major << "." << prop.minor << ", "
          << prop.multiProcessorCount << " SMs @ " << (prop.clockRate / 1000.0) << " MHz, "
          << "L2 cache: " << (prop.l2CacheSize >> 20) << " MB, Global Memory: " << (prop.totalGlobalMem >> 30) << " GB"
          << std::endl;
    }
  }
  out << "\n";
@ -133,15 +186,8 @@ void Options::Device::print_usage(std::ostream &out) const {
}

void Options::Device::print_device_info(std::ostream &out) const {
  int num_devices;
  cudaDeviceProp props;

  cudaError_t result;
  result = cudaGetDeviceCount(&num_devices);

  if (result != cudaSuccess) {
    throw std::runtime_error("cudaGetDeviceCount() failed");
  }

  out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;

@ -165,14 +211,28 @@ void Options::Device::print_device_info(std::ostream &out) const {
void Options::Device::print_options(std::ostream &out, int indent) const {

  out
    << indent_str(indent) << "device: " << device << "\n"
    << indent_str(indent) << "clock: " << int(double(properties.clockRate) / 1000.0) << "\n"
    << indent_str(indent) << "compute-capability: " << compute_capability() << "\n";
    << indent_str(indent) << "devices: ";
  for (int device : devices) {
    out << device << ',';
  }
  out
    << "\n"
    << indent_str(indent) << "clock: " << int(double(properties[0].clockRate) / 1000.0) << "\n"
    << indent_str(indent) << "compute-capability: " << compute_capability(0) << "\n";
}

/// Returns the device ID from a device index
int Options::Device::device_id(size_t device_index) const {
  if (device_index >= devices.size()) {
    throw std::runtime_error("Out of bounds device index: " + std::to_string(device_index));
  }
  return devices.at(device_index);
}

/// Returns the compute capability of the listed device (e.g. 61, 60, 70, 75)
int Options::Device::compute_capability() const {
  return properties.major * 10 + properties.minor;
int Options::Device::compute_capability(int device_index) const {
  return properties[device_index].major * 10 + properties[device_index].minor;
}

/////////////////////////////////////////////////////////////////////////////////////////////////
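A compact model of the new index-based accessors (sketch; the real class carries full cudaDeviceProp entries rather than (major, minor) pairs):

#include <stdexcept>
#include <utility>
#include <vector>

struct DeviceOptionsModel {
  std::vector<int> devices{0, 1};                        // CUDA device IDs, user order
  std::vector<std::pair<int, int>> sm{{9, 0}, {9, 0}};   // (major, minor) per entry

  int device_id(size_t device_index) const {
    if (device_index >= devices.size()) {
      throw std::runtime_error("Out of bounds device index");
    }
    return devices[device_index];
  }
  int compute_capability(int device_index) const {
    return sm[device_index].first * 10 + sm[device_index].second;  // 90 for SM 9.0
  }
};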
@ -207,10 +267,10 @@ Options::Initialization::Initialization(cutlass::CommandLine const &cmdline) {
  else {
    // profiler-chosen data distribution (allowed to change based on numeric types)
    fix_data_distribution = false;
    // set uniform data distribution with range [-4, 4]
    data_distribution.set_uniform(-4, 4, 0);
  }

}

@ -248,10 +308,10 @@ void Options::Initialization::get_distribution(
  };

  // Initialize pnz values to a default value of 100%
  dist.gaussian.pnz = 100.0;
  dist.gaussian.pnzA = 100.0;
  dist.gaussian.pnzB = 100.0;
  dist.gaussian.pnzC = 100.0;
  dist.gaussian.pnz = 1.0;
  dist.gaussian.pnzA = 1.0;
  dist.gaussian.pnzB = 1.0;
  dist.gaussian.pnzC = 1.0;

  using KeyValueVector = std::vector<std::pair<std::string, std::string> >;

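Note the scale change in the defaults above: the old values treat pnz as a percentage (100.0), the new ones as a fraction (1.0). A quick sanity check of the fractional reading (assumed semantics):

#include <cassert>

int main() {
  double pnz = 1.0;                 // fraction of nonzero elements
  int elements = 1024;
  int expected_nonzero = static_cast<int>(pnz * elements);
  assert(expected_nonzero == 1024); // pnz == 1.0 means every element is nonzero
  return 0;
}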
@ -335,7 +395,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
    std::string mode = "default";
    cmdline.get_cmd_line_argument("library-algo-mode", mode);
    algorithm_mode = from_string<AlgorithmMode>(mode);
  }
}

if (cmdline.check_cmd_line_flag("library-algos")) {

@ -353,7 +413,7 @@ Options::Library::Library(cutlass::CommandLine const &cmdline) {
  }
  else {
    int algo;
    std::stringstream ss;

    ss << token;
    ss >> algo;
@ -396,12 +456,12 @@ void Options::Library::print_options(std::ostream &out, int indent) const {

Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("workspace-count", workspace_count, 0);
  cmdline.get_cmd_line_argument("warmup-iterations", warmup_iterations, 10);
  cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
  cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
  cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);

  if (cmdline.check_cmd_line_flag("providers")) {

    std::vector<std::string> tokens;
@ -416,7 +476,7 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
  else {
    providers.push_back(library::Provider::kCUTLASS);
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kCUDNN);
  }
}

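All of these constructors use the same (flag-name, destination, default) pattern; a minimal map-backed model of it (sketch; cutlass::CommandLine itself is not shown):

#include <map>
#include <string>

void get_cmd_line_argument(std::map<std::string, int> const &args,
                           std::string const &name, int &value, int default_value) {
  auto it = args.find(name);
  value = (it == args.end()) ? default_value : it->second;
}

int main() {
  std::map<std::string, int> args{{"profiling-iterations", 200}};
  int warmup = 0, iterations = 0;
  get_cmd_line_argument(args, "warmup-iterations", warmup, 10);         // absent -> 10
  get_cmd_line_argument(args, "profiling-iterations", iterations, 100); // present -> 200
  return (warmup == 10 && iterations == 200) ? 0 : 1;
}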
@ -480,7 +540,7 @@ size_t Options::Profiling::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////

Options::Verification::Verification(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("verification-enabled", enabled, true);
  if (enabled) {
    cmdline.get_cmd_line_argument("verification-required", required, false);
@ -500,7 +560,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
  }

  if (cmdline.check_cmd_line_flag("verification-providers")) {

    std::vector<std::string> tokens;
    cmdline.get_cmd_line_arguments("verification-providers", tokens);

@ -516,7 +576,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) {
  else {
    providers.push_back(library::Provider::kCUBLAS);
    providers.push_back(library::Provider::kReferenceDevice);
    providers.push_back(library::Provider::kCUDNN);
  }
}

@ -583,11 +643,11 @@ size_t Options::Verification::index(library::Provider provider) const {
/////////////////////////////////////////////////////////////////////////////////////////////////

Options::Report::Report(cutlass::CommandLine const &cmdline) {

  cmdline.get_cmd_line_argument("append", append, false);
  cmdline.get_cmd_line_argument("output", output_path);
  cmdline.get_cmd_line_argument("junit-output", junit_output_path);

  if (cmdline.check_cmd_line_flag("tags")) {
    cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
  }
@ -687,11 +747,11 @@ Options::Options(cutlass::CommandLine const &cmdline):
  device(cmdline),
  initialization(cmdline),
  library(cmdline),
  profiling(cmdline),
  verification(cmdline),
  report(cmdline),
  about(cmdline) {

  if (cmdline.check_cmd_line_flag("mode")) {
    std::string token;
    cmdline.get_cmd_line_argument("mode", token);

@ -94,7 +94,7 @@ PerformanceReport::PerformanceReport(
  if (options_.report.append) {

    std::ifstream test_output_file(op_file_name_);

    if (test_output_file.is_open()) {
      print_header = false;
      test_output_file.close();
@ -145,7 +145,7 @@ void PerformanceReport::append_result(PerformanceResult result) {

  if (options_.report.verbose) {
    std::cout << "\n";
    print_result_pretty_(std::cout, result) << std::flush;
  }

  if (junit_output_file_.is_open()) {
@ -237,7 +237,7 @@ static const char *disposition_status_color(Disposition disposition) {

/// Prints the result in human readable form
std::ostream & PerformanceReport::print_result_pretty_(
  std::ostream &out,
  PerformanceResult const &result,
  bool use_shell_coloring) {

@ -251,14 +251,14 @@ std::ostream & PerformanceReport::print_result_pretty_(
    int column_idx = 0;
    for (auto const & tag : options_.report.pivot_tags) {
      out << (column_idx++ ? "," : "") << tag.first << ":" << tag.second;
    }

    out << "\n";
  }

  std::string shell_color_bright = use_shell_coloring ? SHELL_COLOR_BRIGHT() : "";
  std::string shell_color_end = use_shell_coloring ? SHELL_COLOR_END() : "";
  auto _disposition_status_color = [&](Disposition d) -> const char * {
    return use_shell_coloring ? disposition_status_color(d) : "";
  };

@ -277,7 +277,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
    static int const indent_spaces = 16;

    for (auto & m : result.verification_map) {
      out << std::right << std::setw(indent_spaces) << library::to_string(m.first, true) << ": " << to_string(m.second, true) << "\n";
    }
  }

@ -287,7 +287,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
  int column_idx = 0;
  for (auto const &arg : result.arguments) {
    if (!arg.second.empty()) {
      out << " --" << arg.first << "=" << arg.second;
      column_idx += int(4 + arg.first.size() + arg.second.size());
      if (column_idx > 98) {
        out << " \\\n ";
@ -297,7 +297,7 @@ std::ostream & PerformanceReport::print_result_pretty_(
  }
  out << "\n\n";

  out
    << " Bytes: " << result.bytes << " bytes\n"
    << " FLOPs: " << result.flops << " flops\n"
    << " FLOPs/Byte: " << (result.flops / result.bytes) << "\n\n";
@ -325,7 +325,7 @@ std::ostream & PerformanceReport::print_csv_header_(
    out << (column_idx++ ? "," : "") << tag.first;
  }

  out
    << (column_idx ? "," : "") << "Problem,Provider"
    << ",OperationKind,Operation,Disposition,Status";

@ -333,7 +333,7 @@ std::ostream & PerformanceReport::print_csv_header_(
    out << "," << arg_name;
  }

  out
    << ",Bytes"
    << ",Flops"
    << ",Flops/Byte"
@ -347,7 +347,7 @@ std::ostream & PerformanceReport::print_csv_header_(

/// Print the result in CSV output
std::ostream & PerformanceReport::print_result_csv_(
  std::ostream &out,
  PerformanceResult const &result) {

  int column_idx = 0;
@ -357,8 +357,8 @@ std::ostream & PerformanceReport::print_result_csv_(
    out << (column_idx++ ? "," : "") << tag.second;
  }

  out
    << (column_idx ? "," : "")
    << result.problem_index
    << "," << to_string(result.provider, true)
    << "," << to_string(result.op_kind)
@ -370,7 +370,7 @@ std::ostream & PerformanceReport::print_result_csv_(
    out << "," << arg.second;
  }

  out
    << "," << result.bytes
    << "," << result.flops
    << "," << result.flops / result.bytes
@ -387,7 +387,7 @@ std::ostream & PerformanceReport::print_result_csv_(
  else {
    out << std::string(2, ',');
  }

  return out;
@ -451,25 +451,25 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform
  case Disposition::kNotSupported:
    skipped = true;
    break;
  case Disposition::kPassed:
  case Disposition::kNotVerified:
    break;
  case Disposition::kFailed:
  case Disposition::kIncorrect:
    failed = true;
    break;
  case Disposition::kInvalidProblem:
  case Disposition::kInvalid:
    error = true;
    break;
  };

  if (skipped) {
    out << "status=\"notrun\"";
  } else {
    out << "status=\"run\"";
  }

  out << ">" << std::endl;

  if (failed) {
@ -488,7 +488,7 @@ std::ostream & PerformanceReport::print_junit_result_(std::ostream &out, Perform

  out << " </testcase>" << std::endl;

  return out;
}

|
||||
/* \file
|
||||
\brief Execution environment
|
||||
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
@ -54,7 +54,7 @@ namespace profiler {
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Ctor
|
||||
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
|
||||
Rank2KOperationProfiler::Rank2KOperationProfiler(Options const &options):
|
||||
OperationProfiler(
|
||||
options,
|
||||
library::OperationKind::kRank2K,
|
||||
@ -95,7 +95,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
|
||||
out << "\nExamples:\n\n"
|
||||
<< "Profile a particular problem size Syrk kernel:\n"
|
||||
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=symmetric --n=1024 --k=128\n\n"
|
||||
|
||||
|
||||
<< "Profile a particular problem size Herk kernel:\n"
|
||||
<< " $ cutlass_profiler --operation=rank_2k --blas_mode=hermitian --n=1024 --k=128\n\n"
|
||||
|
||||
@ -118,7 +118,7 @@ void Rank2KOperationProfiler::print_examples(std::ostream &out) const {
|
||||
|
||||
<< "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
|
||||
<< " $ cutlass_profiler --operation=rank_2k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"
|
||||
|
||||
|
||||
<< "Test your changes to rank_2k kernels with a quick functional test and save results in functional-test.csv:\n"
|
||||
<< " $ cutlass_profiler --operation=rank_2k \\ \n"
|
||||
<< " --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
|
||||
@ -148,22 +148,22 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
|
||||
library::RankKDescription const &operation_desc,
|
||||
ProblemSpace const &problem_space,
|
||||
ProblemSpace::Problem const &problem) {
|
||||
|
||||
|
||||
if (!arg_as_int(this->n, "n", problem_space, problem)) {
|
||||
// default value
|
||||
this->n = 1024;
|
||||
}
|
||||
|
||||
|
||||
if (!arg_as_int(this->k, "k", problem_space, problem)) {
|
||||
// default value
|
||||
this->k = 1024;
|
||||
}
|
||||
|
||||
|
||||
if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
|
||||
// default value
|
||||
this->split_k_slices = 1;
|
||||
}
|
||||
|
||||
|
||||
if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
|
||||
// default value
|
||||
this->batch_count = 1;
|
||||
@ -187,29 +187,29 @@ Status Rank2KOperationProfiler::RankKProblem::parse(
|
||||
}
|
||||
|
||||
if (!arg_as_scalar(
|
||||
this->alpha,
|
||||
operation_desc.element_epilogue,
|
||||
"alpha",
|
||||
problem_space,
|
||||
this->alpha,
|
||||
operation_desc.element_epilogue,
|
||||
"alpha",
|
||||
problem_space,
|
||||
problem)) {
|
||||
|
||||
if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
|
||||
return Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (!arg_as_scalar(
|
||||
this->beta,
|
||||
operation_desc.element_epilogue,
|
||||
"beta",
|
||||
problem_space,
|
||||
this->beta,
|
||||
operation_desc.element_epilogue,
|
||||
"beta",
|
||||
problem_space,
|
||||
problem)) {
|
||||
|
||||
|
||||
if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
|
||||
return Status::kErrorInternal;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
this->lda = DeviceAllocation::get_packed_layout(
|
||||
operation_desc.A.layout, {int(this->n), int(this->k)}).front();
|
||||
|
||||
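For a packed layout the leading dimension is just the extent along the strided mode; a worked example of what the call above yields for a column-major n-by-k operand (assumed convention):

#include <cassert>

int main() {
  int n = 1024, k = 128;
  // Packed column-major n x k: consecutive columns are n elements apart.
  int lda = n;
  assert(lda == 1024 && k == 128);
  return 0;
}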
@ -311,14 +311,14 @@ void Rank2KOperationProfiler::RankKProblem::initialize_result(

/// Extracts the problem dimensions
Status Rank2KOperationProfiler::initialize_configuration(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::RankKDescription const &operation_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -326,7 +326,7 @@ Status Rank2KOperationProfiler::initialize_configuration(
  }

  Status status = problem_.parse(operation_desc, problem_space, problem);

  if (status != Status::kSuccess) {
    return status;
  }
@ -350,14 +350,14 @@ Status Rank2KOperationProfiler::initialize_configuration(
  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  initialize_result_(this->model_result_, options, operation_desc, problem_space);

  return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}

/// Initializes the performance result
void Rank2KOperationProfiler::initialize_result_(
  PerformanceResult &result,
  Options const &options,
  library::RankKDescription const &operation_desc,
  ProblemSpace const &problem_space) {

@ -365,7 +365,7 @@ void Rank2KOperationProfiler::initialize_result_(
  result.disposition = Disposition::kNotRun;
  result.status = Status::kSuccess;
  result.operation_name = operation_desc.name;

  problem_.initialize_result(result, operation_desc, problem_space);

  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -380,19 +380,30 @@ void Rank2KOperationProfiler::initialize_result_(

/// Initializes workspace
Status Rank2KOperationProfiler::initialize_workspace(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::RankKDescription const &operation_desc =

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  library::RankKDescription const &operation_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    rank_k_workspace_.A = device_context.allocate_tensor(
    rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,
@ -400,10 +411,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
      {int(problem_.n), int(problem_.k)},
      {int(problem_.lda)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    rank_k_workspace_.B = device_context.allocate_tensor(
    rank_k_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,
@ -411,10 +423,11 @@ Status Rank2KOperationProfiler::initialize_workspace(
      {int(problem_.n), int(problem_.k)},
      {int(problem_.ldb)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    rank_k_workspace_.C = device_context.allocate_tensor(
    rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,
@ -422,23 +435,30 @@ Status Rank2KOperationProfiler::initialize_workspace(
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    rank_k_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    rank_k_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
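The two arguments appended by this change are easiest to see as a stub with the same parameter order (names taken from the inline comments above; the real DeviceContext method also takes options, element type, and layout):

#include <string>
#include <vector>

// Stub mirroring the tail of allocate_and_initialize_tensor's argument list.
void* allocate_and_initialize_tensor_stub(
    std::string const &name,            // e.g. "A", "B", "C"
    std::vector<int> const &extent,     // e.g. {n, k}
    std::vector<int> const &stride,     // e.g. {lda}
    int batch_count,                    // 1 for these profilers
    int seed_shift,                     // distinct seed per tensor
    int device_index) {                 // new: which entry of --devices to use
  (void)name; (void)extent; (void)stride;
  (void)batch_count; (void)seed_shift; (void)device_index;
  return nullptr;
}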
@ -487,7 +507,7 @@ Status Rank2KOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_cutlass(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -516,7 +536,7 @@ bool Rank2KOperationProfiler::verify_cutlass(
  //

  results_.back().status = operation->run(
    &rank_k_workspace_.arguments,
    rank_k_workspace_.host_workspace.data(),
    rank_k_workspace_.device_workspace.data());

@ -564,8 +584,8 @@ bool Rank2KOperationProfiler::verify_cutlass(
    }
  }
#endif // #if CUTLASS_ENABLE_CUBLAS

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for (auto &m : results_.back().verification_map) {
@ -591,7 +611,7 @@ bool Rank2KOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against references
bool Rank2KOperationProfiler::verify_with_cublas_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -601,13 +621,13 @@ bool Rank2KOperationProfiler::verify_with_cublas_(

#if CUTLASS_ENABLE_CUBLAS

  library::RankKDescription const &rank_k_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  //
  // Construct cuBLAS operators
  //

  CublasCreate handle;
  cublasStatus_t status = handle.get_cublas_create_status();

@ -636,8 +656,8 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
  rank_k_workspace_.arguments.beta = problem_.beta.data();
  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  detail::cublasRankKDispatcher rank_k_op(
    rank_k_desc,
    rank_k_workspace_.configuration,
    rank_k_workspace_.arguments
  );
@ -669,7 +689,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {

    save_workspace(
@ -694,7 +714,7 @@ bool Rank2KOperationProfiler::verify_with_cublas_(

/// Measures performance results
bool Rank2KOperationProfiler::profile(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -31,7 +31,7 @@
/* \file
   \brief Execution environment
*/

#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
RankKOperationProfiler::RankKOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kRankK,
@ -94,7 +94,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {
  out << "\nExamples:\n\n"
    << "Profile a particular problem size Syrk kernel:\n"
    << " $ cutlass_profiler --operation=rank_k --blas_mode=symmetric --n=1024 --k=128\n\n"

    << "Profile a particular problem size Herk kernel:\n"
    << " $ cutlass_profiler --operation=rank_k --blas_mode=hermitian --n=1024 --k=128\n\n"

@ -117,7 +117,7 @@ void RankKOperationProfiler::print_examples(std::ostream &out) const {

    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
    << " $ cutlass_profiler --operation=rank_k --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"

    << "Test your changes to rank_k kernels with a quick functional test and save results in functional-test.csv:\n"
    << " $ cutlass_profiler --operation=rank_k \\ \n"
    << "   --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -147,22 +147,22 @@ Status RankKOperationProfiler::RankKProblem::parse(
  library::RankKDescription const &operation_desc,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (!arg_as_int(this->n, "n", problem_space, problem)) {
    // default value
    this->n = 1024;
  }

  if (!arg_as_int(this->k, "k", problem_space, problem)) {
    // default value
    this->k = 1024;
  }

  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
    // default value
    this->split_k_slices = 1;
  }

  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
    // default value
    this->batch_count = 1;
@ -182,29 +182,29 @@ Status RankKOperationProfiler::RankKProblem::parse(
  }

  if (!arg_as_scalar(
    this->alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    this->beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
  }

  this->lda = DeviceAllocation::get_packed_layout(
    operation_desc.A.layout, {int(this->n), int(this->k)}).front();

@ -252,7 +252,7 @@ int64_t RankKOperationProfiler::RankKProblem::flops(library::RankKDescription co
  case library::MathOperationID::kMultiplyAddComplexFastF32:
    flops_ *= 4;
    break;

  case library::MathOperationID::kMultiplyAddGaussianComplex:
    flops_ *= 3;
    break;
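The scale factors above follow from the real-multiply cost of one complex product; in LaTeX:

% Schoolbook complex product: 4 real multiplies per multiply-add
(a+bi)(c+di) = (ac - bd) + (ad + bc)i
% Gauss's 3-multiply variant, used by kMultiplyAddGaussianComplex:
k_1 = c(a+b), \qquad k_2 = a(d-c), \qquad k_3 = b(c+d)
(a+bi)(c+di) = (k_1 - k_3) + (k_1 + k_2)i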
@ -300,14 +300,14 @@ void RankKOperationProfiler::RankKProblem::initialize_result(

/// Extracts the problem dimensions
Status RankKOperationProfiler::initialize_configuration(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::RankKDescription const &operation_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  if (operation_desc.rank_k_kind != library::RankKKind::kUniversal) {
@ -315,7 +315,7 @@ Status RankKOperationProfiler::initialize_configuration(
  }

  Status status = problem_.parse(operation_desc, problem_space, problem);

  if (status != Status::kSuccess) {
    return status;
  }
@ -337,14 +337,14 @@ Status RankKOperationProfiler::initialize_configuration(
  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  initialize_result_(this->model_result_, options, operation_desc, problem_space);

  return operation->can_implement(&rank_k_workspace_.configuration, &rank_k_workspace_.arguments);
}

/// Initializes the performance result
void RankKOperationProfiler::initialize_result_(
  PerformanceResult &result,
  Options const &options,
  library::RankKDescription const &operation_desc,
  ProblemSpace const &problem_space) {

@ -352,7 +352,7 @@ void RankKOperationProfiler::initialize_result_(
  result.disposition = Disposition::kNotRun;
  result.status = Status::kSuccess;
  result.operation_name = operation_desc.name;

  problem_.initialize_result(result, operation_desc, problem_space);

  OperationProfiler::initialize_result_(result, operation_desc, problem_space);
@ -368,7 +368,7 @@ void RankKOperationProfiler::initialize_result_(
  case library::MathOperationID::kMultiplyAddComplex:
    result.flops *= 4;
    break;

  case library::MathOperationID::kMultiplyAddComplexFastF32:
    result.flops *= 4;
    break;
@ -380,19 +380,30 @@ void RankKOperationProfiler::initialize_result_(

/// Initializes workspace
Status RankKOperationProfiler::initialize_workspace(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::RankKDescription const &operation_desc =

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  library::RankKDescription const &operation_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    rank_k_workspace_.A = device_context.allocate_tensor(
    rank_k_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,
@ -400,10 +411,11 @@ Status RankKOperationProfiler::initialize_workspace(
      {int(problem_.n), int(problem_.k)},
      {int(problem_.lda)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    rank_k_workspace_.C = device_context.allocate_tensor(
    rank_k_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,
@ -411,23 +423,30 @@ Status RankKOperationProfiler::initialize_workspace(
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    rank_k_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    rank_k_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.n), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    rank_k_workspace_.Computed->copy_from_device(rank_k_workspace_.C->data());
@ -476,7 +495,7 @@ Status RankKOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_cutlass(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -504,7 +523,7 @@ bool RankKOperationProfiler::verify_cutlass(
  //

  results_.back().status = operation->run(
    &rank_k_workspace_.arguments,
    rank_k_workspace_.host_workspace.data(),
    rank_k_workspace_.device_workspace.data());

@ -552,8 +571,8 @@ bool RankKOperationProfiler::verify_cutlass(
    }
  }
#endif // #if CUTLASS_ENABLE_CUBLAS

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for (auto &m : results_.back().verification_map) {
@ -579,7 +598,7 @@ bool RankKOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against references
bool RankKOperationProfiler::verify_with_cublas_(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -589,13 +608,13 @@ bool RankKOperationProfiler::verify_with_cublas_(

#if CUTLASS_ENABLE_CUBLAS

  library::RankKDescription const &rank_k_desc =
    static_cast<library::RankKDescription const &>(operation->description());

  //
  // Construct cuBLAS operators
  //

  CublasCreate handle;
  cublasStatus_t status = handle.get_cublas_create_status();

@ -623,8 +642,8 @@ bool RankKOperationProfiler::verify_with_cublas_(
  rank_k_workspace_.arguments.beta = problem_.beta.data();
  rank_k_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  detail::cublasRankKDispatcher rank_k_op(
    rank_k_desc,
    rank_k_workspace_.configuration,
    rank_k_workspace_.arguments
  );
@ -656,7 +675,7 @@ bool RankKOperationProfiler::verify_with_cublas_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
      results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {

    save_workspace(
@ -681,7 +700,7 @@ bool RankKOperationProfiler::verify_with_cublas_(

/// Measures performance results
bool RankKOperationProfiler::profile(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,

@ -51,23 +51,23 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
SparseGemmOperationProfiler::SparseGemmOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kSparseGemm,
    {
      {ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (e.g. sparse, ...)"},
      {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"},
      {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"},
      {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"},
      {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
      {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
      {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
      {ArgumentTypeID::kTensor, {"E"}, "Tensor storing the E operand"},
      {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
      {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
      {ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"},
      {ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"},
    }
  ) {

@ -109,7 +109,7 @@ void SparseGemmOperationProfiler::print_examples(std::ostream &out) const {

    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
    << " $ cutlass_profiler --operation=SparseGemm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"

    << "Test your changes to gemm kernels with a quick functional test and save results in functional-test.csv:\n"
    << " $ cutlass_profiler --operation=SparseGemm \\ \n"
    << "   --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -125,7 +125,7 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
  library::SparseGemmDescription const &operation_desc,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (!arg_as_int(this->m, "m", problem_space, problem)) {
    // default value
    this->m = 1024;
@ -135,17 +135,17 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
    // default value
    this->n = 1024;
  }

  if (!arg_as_int(this->k, "k", problem_space, problem)) {
    // default value
    this->k = 1024;
  }

  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
    // default value
    this->split_k_slices = 1;
  }

  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
    // default value
    this->batch_count = 1;
@ -168,24 +168,24 @@ Status SparseGemmOperationProfiler::SparseGemmProblem::parse(
  }

  if (!arg_as_scalar(
    this->alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    this->beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
@ -252,14 +252,14 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result(

/// Extracts the problem dimensions
Status SparseGemmOperationProfiler::initialize_configuration(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::SparseGemmDescription const &operation_desc =
    static_cast<library::SparseGemmDescription const &>(operation->description());

  if (operation_desc.gemm_kind != library::GemmKind::kSparse) {
@ -291,14 +291,14 @@ Status SparseGemmOperationProfiler::initialize_configuration(
  gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  initialize_result_(this->model_result_, options, operation_desc, problem_space);

  return operation->can_implement(&gemm_workspace_.configuration, &gemm_workspace_.arguments);
}

/// Initializes the performance result
void SparseGemmOperationProfiler::initialize_result_(
  PerformanceResult &result,
  Options const &options,
  library::SparseGemmDescription const &operation_desc,
  ProblemSpace const &problem_space) {

@ -308,7 +308,7 @@ void SparseGemmOperationProfiler::initialize_result_(
  result.operation_name = operation_desc.name;

  problem_.initialize_result(result, operation_desc, problem_space);

  OperationProfiler::initialize_result_(result, operation_desc, problem_space);

  // Input bytes read and Output bytes written for the gemm problem
@ -337,19 +337,30 @@ void SparseGemmOperationProfiler::initialize_result_(

/// Initializes workspace
Status SparseGemmOperationProfiler::initialize_workspace(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::SparseGemmDescription const &operation_desc =

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  library::SparseGemmDescription const &operation_desc =
    static_cast<library::SparseGemmDescription const &>(operation->description());

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    gemm_workspace_.A = device_context.allocate_tensor(
    gemm_workspace_.A = device_context.allocate_and_initialize_tensor(
      options,
      "A",
      operation_desc.A.element,
@ -357,10 +368,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.k) / int(problem_.sparse)},
      {int(problem_.lda)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.B = device_context.allocate_tensor(
    gemm_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,
@ -368,10 +380,11 @@ Status SparseGemmOperationProfiler::initialize_workspace(
      {int(problem_.k), int(problem_.n)},
      {int(problem_.ldb)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.C = device_context.allocate_tensor(
    gemm_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,
@ -379,18 +392,22 @@ Status SparseGemmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    gemm_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    gemm_workspace_.E = device_context.allocate_sparsemeta_tensor(
    gemm_workspace_.E = device_context.allocate_and_initialize_sparsemeta_tensor(
      options,
      "E",
      operation_desc.E.element,
@ -399,15 +416,19 @@ Status SparseGemmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.k) / int(problem_.sparse) / int(problem_.elements_per_128b)},
      {int(problem_.lde)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );
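A worked example of the sparse operand extents above, for 2:4 structured sparsity (sparse == 2) with eight metadata elements per 128 bits (both values illustrative):

#include <cassert>

int main() {
  int m = 1024, k = 1024;
  int sparse = 2;                 // 2:4 sparsity halves the stored K extent
  int elements_per_128b = 8;      // metadata packing factor (illustrative)
  assert(k / sparse == 512);                       // compressed A is m x 512
  assert(k / sparse / elements_per_128b == 64);    // E is m x 64
  (void)m;
  return 0;
}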

    gemm_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
@ -456,7 +477,7 @@ Status SparseGemmOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool SparseGemmOperationProfiler::verify_cutlass(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -486,7 +507,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(
  //

  results_.back().status = operation->run(
    &gemm_workspace_.arguments,
    gemm_workspace_.host_workspace.data(),
    gemm_workspace_.device_workspace.data());

@ -510,7 +531,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(

  if (options.verification.enabled) {

    // Update disposition to worst case verification outcome among all
    // verification providers which are supported
    bool is_any_verification_run_passed = false;

@ -537,7 +558,7 @@ bool SparseGemmOperationProfiler::verify_cutlass(

/// Measures performance results
bool SparseGemmOperationProfiler::profile(
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -565,7 +586,7 @@ bool SparseGemmOperationProfiler::profile(
      gemm_workspace_.device_workspace.data()
    );
  }

  return true;
}

@ -31,7 +31,7 @@
/* \file
   \brief Execution environment
*/

#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
SymmOperationProfiler::SymmOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kSymm,
@ -96,7 +96,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {
  out << "\nExamples:\n\n"
    << "Profile a particular problem size SYMM kernel:\n"
    << " $ cutlass_profiler --operation=Symm --blas_mode=symmetric --m=1024 --n=128\n\n"

    << "Profile a particular problem size HEMM kernel:\n"
    << " $ cutlass_profiler --operation=Symm --blas_mode=hermitian --m=1024 --n=128\n\n"

@ -122,7 +122,7 @@ void SymmOperationProfiler::print_examples(std::ostream &out) const {

    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
    << " $ cutlass_profiler --operation=Symm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"

    << "Test your changes to symm kernels with a quick functional test and save results in functional-test.csv:\n"
    << " $ cutlass_profiler --operation=Symm \\ \n"
    << "   --m=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -152,22 +152,22 @@ Status SymmOperationProfiler::SymmProblem::parse(
  library::SymmDescription const &operation_desc,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (!arg_as_int(this->m, "m", problem_space, problem)) {
    // default value
    this->m = 1024;
  }

  if (!arg_as_int(this->n, "n", problem_space, problem)) {
    // default value
    this->n = 1024;
  }

  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
    // default value
    this->split_k_slices = 1;
  }

  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
    // default value
    this->batch_count = 1;
@ -191,29 +191,29 @@ Status SymmOperationProfiler::SymmProblem::parse(
  }

  if (!arg_as_scalar(
    this->alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    this->beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
  }

  if (operation_desc.side_mode == SideMode::kLeft) {
    this->lda = DeviceAllocation::get_packed_layout(
      operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -240,12 +240,12 @@ int64_t SymmOperationProfiler::SymmProblem::bytes(library::SymmDescription const
|
||||
if (operation_desc.side_mode == SideMode::kLeft) {
|
||||
bytes =
|
||||
int64_t(library::sizeof_bits(operation_desc.A.element) * m / 8) * (m + 1) / 2 +
|
||||
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
|
||||
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
|
||||
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
|
||||
} else if (operation_desc.side_mode == SideMode::kRight) {
|
||||
bytes =
|
||||
int64_t(library::sizeof_bits(operation_desc.A.element) * n / 8) * (n + 1) / 2 +
|
||||
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
|
||||
int64_t(library::sizeof_bits(operation_desc.B.element) * m / 8) * n +
|
||||
int64_t(library::sizeof_bits(operation_desc.C.element) * m / 8) * n;
|
||||
}
|
||||
// Set is_beta_zero true if beta is zero
|
||||
@ -277,7 +277,7 @@ int64_t SymmOperationProfiler::SymmProblem::flops(library::SymmDescription const
|
||||
case library::MathOperationID::kMultiplyAddComplex:
|
||||
flops_ *= 4;
|
||||
break;
|
||||
|
||||
|
||||
case library::MathOperationID::kMultiplyAddComplexFastF32:
|
||||
flops_ *= 4;
|
||||
break;
|
||||
@ -334,14 +334,14 @@ void SymmOperationProfiler::SymmProblem::initialize_result(
|
||||
|
||||
/// Extracts the problem dimensions
|
||||
Status SymmOperationProfiler::initialize_configuration(
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
PerformanceReport &report,
|
||||
DeviceContext &device_context,
|
||||
library::Operation const *operation,
|
||||
ProblemSpace const &problem_space,
|
||||
ProblemSpace::Problem const &problem) {
|
||||
|
||||
library::SymmDescription const &operation_desc =
|
||||
library::SymmDescription const &operation_desc =
|
||||
static_cast<library::SymmDescription const &>(operation->description());
|
||||
|
||||
if (operation_desc.symm_kind != library::SymmKind::kUniversal) {
|
||||
@ -349,14 +349,14 @@ Status SymmOperationProfiler::initialize_configuration(
|
||||
}
|
||||
|
||||
Status status = problem_.parse(operation_desc, problem_space, problem);
|
||||
|
||||
|
||||
if (status != Status::kSuccess) {
|
||||
return status;
|
||||
}
|
||||
|
||||
symm_workspace_.configuration.problem_size.m() = int(problem_.m);
|
||||
symm_workspace_.configuration.problem_size.n() = int(problem_.n);
|
||||
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
|
||||
symm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
|
||||
? int(problem_.m) : int(problem_.n);
|
||||
symm_workspace_.configuration.lda = problem_.lda;
|
||||
symm_workspace_.configuration.ldb = problem_.ldb;
|
||||
@ -374,14 +374,14 @@ Status SymmOperationProfiler::initialize_configuration(
|
||||
symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
|
||||
|
||||
initialize_result_(this->model_result_, options, operation_desc, problem_space);
|
||||
|
||||
|
||||
return operation->can_implement(&symm_workspace_.configuration, &symm_workspace_.arguments);
|
||||
}
|
||||
|
||||
/// Initializes the performance result
|
||||
void SymmOperationProfiler::initialize_result_(
|
||||
PerformanceResult &result,
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
library::SymmDescription const &operation_desc,
|
||||
ProblemSpace const &problem_space) {
|
||||
|
||||
@ -389,7 +389,7 @@ void SymmOperationProfiler::initialize_result_(
|
||||
result.disposition = Disposition::kNotRun;
|
||||
result.status = Status::kSuccess;
|
||||
result.operation_name = operation_desc.name;
|
||||
|
||||
|
||||
problem_.initialize_result(result, operation_desc, problem_space);
|
||||
|
||||
OperationProfiler::initialize_result_(result, operation_desc, problem_space);
|
||||
@ -404,20 +404,31 @@ void SymmOperationProfiler::initialize_result_(
|
||||
|
||||
/// Initializes workspace
|
||||
Status SymmOperationProfiler::initialize_workspace(
|
||||
Options const &options,
|
||||
Options const &options,
|
||||
PerformanceReport &report,
|
||||
DeviceContext &device_context,
|
||||
library::Operation const *operation,
|
||||
ProblemSpace const &problem_space,
|
||||
ProblemSpace::Problem const &problem) {
|
||||
|
||||
library::SymmDescription const &operation_desc =
|
||||
|
||||
if (options.device.devices.size() != 1) {
|
||||
throw std::runtime_error("This operation profiler only supports a single "
|
||||
"device.");
|
||||
}
|
||||
|
||||
cudaError_t result;
|
||||
result = cudaSetDevice(options.device.device_id(0));
|
||||
if (result != cudaSuccess) {
|
||||
throw std::runtime_error("cudaSetDevice() failed.");
|
||||
}
|
||||
|
||||
library::SymmDescription const &operation_desc =
|
||||
static_cast<library::SymmDescription const &>(operation->description());
|
||||
|
||||
if (options.execution_mode != ExecutionMode::kDryRun) {
|
||||
int seed_shift = 0;
|
||||
if (operation_desc.side_mode == SideMode::kLeft) {
|
||||
symm_workspace_.A = device_context.allocate_tensor(
|
||||
symm_workspace_.A = device_context.allocate_and_initialize_tensor(
|
||||
options,
|
||||
"A",
|
||||
operation_desc.A.element,
|
||||
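Both the SYMM and TRMM workspace initializers now begin with the same single-device guard before touching any allocation. A sketch of that pattern as a standalone helper, assuming the Options::device interface (devices, device_id) shown in the hunk above; the helper name is hypothetical:

    // Hypothetical helper mirroring the guard added to initialize_workspace().
    inline void pin_to_single_device(Options const &options) {
      if (options.device.devices.size() != 1) {
        throw std::runtime_error("This operation profiler only supports a single device.");
      }
      if (cudaSetDevice(options.device.device_id(0)) != cudaSuccess) {
        throw std::runtime_error("cudaSetDevice() failed.");
      }
    }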
@ -425,10 +436,11 @@ Status SymmOperationProfiler::initialize_workspace(
        {int(problem_.m), int(problem_.m)},
        {int(problem_.lda)},
        1, // batch_count
        seed_shift++
        seed_shift++,
        0 // device_index
      );
    } else if (operation_desc.side_mode == SideMode::kRight) {
      symm_workspace_.A = device_context.allocate_tensor(
      symm_workspace_.A = device_context.allocate_and_initialize_tensor(
        options,
        "A",
        operation_desc.A.element,
@ -436,11 +448,12 @@ Status SymmOperationProfiler::initialize_workspace(
        {int(problem_.n), int(problem_.n)},
        {int(problem_.lda)},
        1, // batch_count
        seed_shift++
        seed_shift++,
        0 // device_index
      );
    }

    symm_workspace_.B = device_context.allocate_tensor(
    symm_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,
@ -448,10 +461,11 @@ Status SymmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldb)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    symm_workspace_.C = device_context.allocate_tensor(
    symm_workspace_.C = device_context.allocate_and_initialize_tensor(
      options,
      "C",
      operation_desc.C.element,
@ -459,23 +473,30 @@ Status SymmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    symm_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    symm_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.C.element,
      operation_desc.C.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldc)}
      {int(problem_.ldc)},
      1, // batch_count
      0 // device_index
    );

    symm_workspace_.Computed->copy_from_device(symm_workspace_.C->data());
@ -524,7 +545,7 @@ Status SymmOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_cutlass(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -553,7 +574,7 @@ bool SymmOperationProfiler::verify_cutlass(
  //

  results_.back().status = operation->run(
    &symm_workspace_.arguments,
    &symm_workspace_.arguments,
    symm_workspace_.host_workspace.data(),
    symm_workspace_.device_workspace.data());

@ -601,8 +622,8 @@ bool SymmOperationProfiler::verify_cutlass(
    }
  }
#endif // #if CUTLASS_ENABLE_CUBLAS

  // Update disposition to worst case verification outcome among all

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for(auto &m : results_.back().verification_map) {
@ -628,7 +649,7 @@ bool SymmOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against references
bool SymmOperationProfiler::verify_with_cublas_(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -638,13 +659,13 @@ bool SymmOperationProfiler::verify_with_cublas_(

#if CUTLASS_ENABLE_CUBLAS

  library::SymmDescription const &symm_desc =
  library::SymmDescription const &symm_desc =
    static_cast<library::SymmDescription const &>(operation->description());

  //
  // Construct cuBLAS operators
  //

  CublasCreate handle;
  cublasStatus_t status = handle.get_cublas_create_status();

@ -673,8 +694,8 @@ bool SymmOperationProfiler::verify_with_cublas_(
  symm_workspace_.arguments.beta = problem_.beta.data();
  symm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  detail::cublasSymmDispatcher symm_op(
    symm_desc,
  detail::cublasSymmDispatcher symm_op(
    symm_desc,
    symm_workspace_.configuration,
    symm_workspace_.arguments
  );
@ -706,7 +727,7 @@ bool SymmOperationProfiler::verify_with_cublas_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
    results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {

    save_workspace(
@ -731,7 +752,7 @@ bool SymmOperationProfiler::verify_with_cublas_(

/// Measures performance results
bool SymmOperationProfiler::profile(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -31,7 +31,7 @@
/* \file
   \brief Execution environment
*/

#include <iostream>
@ -54,7 +54,7 @@ namespace profiler {
/////////////////////////////////////////////////////////////////////////////////////////////////

/// Ctor
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
TrmmOperationProfiler::TrmmOperationProfiler(Options const &options):
  OperationProfiler(
    options,
    library::OperationKind::kTrmm,
@ -113,7 +113,7 @@ void TrmmOperationProfiler::print_examples(std::ostream &out) const {

    << "Run a kernel with cta tile size of 256x128x32 and save workspace if results are incorrect (note that --cta-tile::k=32 is default cta-tile size):\n"
    << " $ cutlass_profiler --operation=Trmm --cta_m=256 --cta_n=128 --cta_k=32 --save-workspace=incorrect\n\n"

    << "Test your changes to trmm kernels with a quick functional test and save results in functional-test.csv:\n"
    << " $ cutlass_profiler --operation=Trmm \\ \n"
    << "   --n=8,56,120,136,256,264,512,520,1024,1032,4096,8192,16384 \\ \n"
@ -143,22 +143,22 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
  library::TrmmDescription const &operation_desc,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  if (!arg_as_int(this->m, "m", problem_space, problem)) {
    // default value
    this->m = 1024;
  }

  if (!arg_as_int(this->n, "n", problem_space, problem)) {
    // default value
    this->n = 1024;
  }

  if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) {
    // default value
    this->split_k_slices = 1;
  }

  if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) {
    // default value
    this->batch_count = 1;
@ -182,29 +182,29 @@ Status TrmmOperationProfiler::TrmmProblem::parse(
  }

  if (!arg_as_scalar(
    this->alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    this->alpha,
    operation_desc.element_epilogue,
    "alpha",
    problem_space,
    problem)) {

    if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) {
      return Status::kErrorInternal;
    }
  }

  if (!arg_as_scalar(
    this->beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    this->beta,
    operation_desc.element_epilogue,
    "beta",
    problem_space,
    problem)) {

    if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) {
      return Status::kErrorInternal;
    }
  }

  if (operation_desc.side_mode == SideMode::kLeft) {
    this->lda = DeviceAllocation::get_packed_layout(
      operation_desc.A.layout, {int(this->m), int(this->m)}).front();
@ -265,14 +265,14 @@ void TrmmOperationProfiler::TrmmProblem::initialize_result(

/// Extracts the problem dimensions
Status TrmmOperationProfiler::initialize_configuration(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::TrmmDescription const &operation_desc =
  library::TrmmDescription const &operation_desc =
    static_cast<library::TrmmDescription const &>(operation->description());

  if (operation_desc.trmm_kind != library::TrmmKind::kUniversal) {
@ -280,14 +280,14 @@ Status TrmmOperationProfiler::initialize_configuration(
  }

  Status status = problem_.parse(operation_desc, problem_space, problem);

  if (status != Status::kSuccess) {
    return status;
  }

  trmm_workspace_.configuration.problem_size.m() = int(problem_.m);
  trmm_workspace_.configuration.problem_size.n() = int(problem_.n);
  trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
  trmm_workspace_.configuration.problem_size.k() = (operation_desc.side_mode == SideMode::kLeft)
    ? int(problem_.m) : int(problem_.n);
  trmm_workspace_.configuration.lda = problem_.lda;
  trmm_workspace_.configuration.ldb = problem_.ldb;
@ -303,14 +303,14 @@ Status TrmmOperationProfiler::initialize_configuration(
  trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  initialize_result_(this->model_result_, options, operation_desc, problem_space);

  return operation->can_implement(&trmm_workspace_.configuration, &trmm_workspace_.arguments);
}

/// Initializes the performance result
void TrmmOperationProfiler::initialize_result_(
  PerformanceResult &result,
  Options const &options,
  Options const &options,
  library::TrmmDescription const &operation_desc,
  ProblemSpace const &problem_space) {

@ -318,30 +318,30 @@ void TrmmOperationProfiler::initialize_result_(
  result.disposition = Disposition::kNotRun;
  result.status = Status::kSuccess;
  result.operation_name = operation_desc.name;

  problem_.initialize_result(result, operation_desc, problem_space);

  OperationProfiler::initialize_result_(result, operation_desc, problem_space);

  if (operation_desc.side_mode == SideMode::kLeft) {
    // Input bytes read and Output bytes written for the trmm problem
    result.bytes =
    result.bytes =
      // Half matrix including the diagonal will have (M*(M+1))/2 elements
      int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * (problem_.m + 1) / 2 +
      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
      int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
  } else if (operation_desc.side_mode == SideMode::kRight) {
    // Input bytes read and Output bytes written for the trmm problem
    result.bytes =
    result.bytes =
      // Half matrix including the diagonal will have (N*(N+1))/2 elements
      int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.n / 8) * (problem_.n + 1) / 2 +
      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
      int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.m / 8) * problem_.n +
      int64_t(library::sizeof_bits(operation_desc.D.element) * problem_.m / 8) * problem_.n;
  }

  // FLOPs = 2 * [ ( M * (M+1)/2 * N ) ] // Beta is zero
  result.flops = problem_.m * (problem_.m + 1) * problem_.n;

  result.runtime = 0;

  // complex-valued support
@ -349,11 +349,11 @@ void TrmmOperationProfiler::initialize_result_(
    case library::MathOperationID::kMultiplyAddComplex:
      result.flops *= 4;
      break;

    case library::MathOperationID::kMultiplyAddComplexFastF32:
      result.flops *= 4;
      break;

    default: break;
  }

@ -361,20 +361,31 @@ void TrmmOperationProfiler::initialize_result_(

/// Initializes workspace
Status TrmmOperationProfiler::initialize_workspace(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  library::TrmmDescription const &operation_desc =

  if (options.device.devices.size() != 1) {
    throw std::runtime_error("This operation profiler only supports a single "
                             "device.");
  }

  cudaError_t result;
  result = cudaSetDevice(options.device.device_id(0));
  if (result != cudaSuccess) {
    throw std::runtime_error("cudaSetDevice() failed.");
  }

  library::TrmmDescription const &operation_desc =
    static_cast<library::TrmmDescription const &>(operation->description());

  if (options.execution_mode != ExecutionMode::kDryRun) {
    int seed_shift = 0;
    if (operation_desc.side_mode == SideMode::kLeft) {
      trmm_workspace_.A = device_context.allocate_tensor(
      trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
        options,
        "A",
        operation_desc.A.element,
@ -382,10 +393,11 @@ Status TrmmOperationProfiler::initialize_workspace(
        {int(problem_.m), int(problem_.m)},
        {int(problem_.lda)},
        1, // batch_count
        seed_shift++
        seed_shift++,
        0 // device_index
      );
    } else if (operation_desc.side_mode == SideMode::kRight) {
      trmm_workspace_.A = device_context.allocate_tensor(
      trmm_workspace_.A = device_context.allocate_and_initialize_tensor(
        options,
        "A",
        operation_desc.A.element,
@ -393,11 +405,12 @@ Status TrmmOperationProfiler::initialize_workspace(
        {int(problem_.n), int(problem_.n)},
        {int(problem_.lda)},
        1, // batch_count
        seed_shift++
        seed_shift++,
        0 // device_index
      );
    }

    trmm_workspace_.B = device_context.allocate_tensor(
    trmm_workspace_.B = device_context.allocate_and_initialize_tensor(
      options,
      "B",
      operation_desc.B.element,
@ -405,23 +418,30 @@ Status TrmmOperationProfiler::initialize_workspace(
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldb)},
      1, // batch_count
      seed_shift++
      seed_shift++,
      0 // device_index
    );

    trmm_workspace_.Computed = device_context.allocate_tensor(
      options,
      "D",
      operation_desc.D.element,
      operation_desc.D.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldd)}
      {int(problem_.ldd)},
      1, // batch_count
      0 // device_index
    );

    trmm_workspace_.Reference = device_context.allocate_tensor(
      options,
      "Reference",
      operation_desc.D.element,
      operation_desc.D.layout,
      {int(problem_.m), int(problem_.n)},
      {int(problem_.ldd)}
      {int(problem_.ldd)},
      1, // batch_count
      0 // device_index
    );

  }
@ -467,7 +487,7 @@ Status TrmmOperationProfiler::initialize_workspace(

/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_cutlass(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -495,7 +515,7 @@ bool TrmmOperationProfiler::verify_cutlass(
  //

  results_.back().status = operation->run(
    &trmm_workspace_.arguments,
    &trmm_workspace_.arguments,
    trmm_workspace_.host_workspace.data(),
    trmm_workspace_.device_workspace.data());

@ -543,8 +563,8 @@ bool TrmmOperationProfiler::verify_cutlass(
    }
  }
#endif // #if CUTLASS_ENABLE_CUBLAS

  // Update disposition to worst case verification outcome among all

  // Update disposition to worst case verification outcome among all
  // verification providers which are supported
  bool is_any_verification_run_passed = false;
  for(auto &m : results_.back().verification_map) {
@ -570,7 +590,7 @@ bool TrmmOperationProfiler::verify_cutlass(

/// Verifies CUTLASS against references
bool TrmmOperationProfiler::verify_with_cublas_(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -580,13 +600,13 @@ bool TrmmOperationProfiler::verify_with_cublas_(

#if CUTLASS_ENABLE_CUBLAS

  library::TrmmDescription const &trmm_desc =
  library::TrmmDescription const &trmm_desc =
    static_cast<library::TrmmDescription const &>(operation->description());

  //
  // Construct cuBLAS operators
  //

  CublasCreate handle;
  cublasStatus_t status = handle.get_cublas_create_status();

@ -614,8 +634,8 @@ bool TrmmOperationProfiler::verify_with_cublas_(
  trmm_workspace_.arguments.beta = problem_.beta.data();
  trmm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;

  detail::cublasTrmmDispatcher trmm_op(
    trmm_desc,
  detail::cublasTrmmDispatcher trmm_op(
    trmm_desc,
    trmm_workspace_.configuration,
    trmm_workspace_.arguments
  );
@ -646,7 +666,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(
  );

  // Save workspace if incorrect
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
  if (options.verification.save_workspace == SaveWorkspace::kIncorrect &&
    results_.back().verification_map[library::Provider::kCUBLAS] == Disposition::kIncorrect) {

    save_workspace(
@ -671,7 +691,7 @@ bool TrmmOperationProfiler::verify_with_cublas_(

/// Measures performance results
bool TrmmOperationProfiler::profile(
  Options const &options,
  Options const &options,
  PerformanceReport &report,
  DeviceContext &device_context,
  library::Operation const *operation,
@ -37,9 +37,11 @@
*/

#include <memory>
#include <sstream>

#include "cutlass/platform/platform.h"
#include "cutlass/numeric_types.h"
#include "cutlass/trace.h"
#include "exceptions.h"

namespace cutlass {
@ -61,8 +63,20 @@ T* allocate(size_t count = 1) {
  cudaError_t cuda_error = cudaMalloc((void**)&ptr, bytes);

  if (cuda_error != cudaSuccess) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 0)
    std::ostringstream os;
    os << "cutlass::device_memory::allocate: cudaMalloc failed: bytes=" << bytes;
    CUTLASS_TRACE_HOST(os.str());
#endif
    throw cuda_exception("Failed to allocate memory", cuda_error);
  }
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
  else {
    std::ostringstream os;
    os << "cutlass::device_memory::allocate: Successful cudaMalloc: bytes=" << bytes;
    CUTLASS_TRACE_HOST(os.str());
  }
#endif

  return ptr;
}
@ -85,11 +99,36 @@ void free(T* ptr) {
template <typename T>
void copy(T* dst, T const* src, size_t count, cudaMemcpyKind kind) {
  size_t bytes = count * sizeof_bits<T>::value / 8;
  if (bytes == 0 && count > 0)
  if (bytes == 0 && count > 0) {
    bytes = 1;
  }
  cudaError_t cuda_error = (cudaMemcpy(dst, src, bytes, kind));
  if (cuda_error != cudaSuccess) {
    throw cuda_exception("cudaMemcpy() failed", cuda_error);
    std::ostringstream os;
    os << "cutlass::device_memory::copy: cudaMemcpy() failed: "
       << "dst=" << dst << ", src=" << src
       << ", bytes=" << bytes << ", count=" << count;
    if (kind == cudaMemcpyHostToDevice) {
      os << ", kind=cudaMemcpyHostToDevice";
    }
    else if (kind == cudaMemcpyDeviceToHost) {
      os << ", kind=cudaMemcpyDeviceToHost";
    }
    else if (kind == cudaMemcpyDeviceToDevice) {
      os << ", kind=cudaMemcpyDeviceToDevice";
    }
    else if (kind == cudaMemcpyHostToHost) {
      os << ", kind=cudaMemcpyHostToHost";
    }
    else if (kind == cudaMemcpyDefault) {
      os << ", kind=cudaMemcpyDefault";
    }
    else {
      os << ", kind=Unknown";
    }
    os << ", error: " << cudaGetErrorString(cuda_error);

    throw cuda_exception(os.str().c_str(), cuda_error);
  }
}
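With these changes a failed cudaMemcpy surfaces the destination and source pointers, byte count, element count, copy kind, and cudaGetErrorString text instead of a bare "cudaMemcpy() failed". A minimal sketch of how the richer message reaches callers through the public cutlass/util headers; the oversized allocation exists only to force the failure path:

    #include <iostream>
    #include "cutlass/util/device_memory.h"

    int main() {
      try {
        // Deliberately oversized request to exercise the failure path.
        cutlass::device_memory::allocation<float> huge(size_t(1) << 60);
      }
      catch (cutlass::cuda_exception const &e) {
        // With CUTLASS_DEBUG_TRACE_LEVEL > 0, allocate() additionally traces
        // the failing byte count via CUTLASS_TRACE_HOST.
        std::cerr << e.what() << "\n";
      }
      return 0;
    }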
@ -51,6 +51,8 @@ struct Distribution {
  struct {
    double min;
    double max;
    // Percent elements set to NaN
    double pnan;
  } uniform;

  /// Gaussian distribution
@ -82,17 +84,18 @@ struct Distribution {

  Distribution() : kind(Invalid), int_scale(0) {}

  /// Configures distribution as uniform random
  Distribution &set_uniform(double _min, double _max, int _int_scale = 0) {
  /// Configures distribution as uniform random
  Distribution &set_uniform(double _min, double _max, int _int_scale = 0, double _pnan = 0) {
    kind = Uniform;
    uniform.min = _min;
    uniform.max = _max;
    int_scale = _int_scale;
    uniform.pnan = _pnan;
    return *this;
  }

  /// Configures distribution as Gaussian distribution
  Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 100.0) {
  Distribution &set_gaussian(double _mean, double _stddev, int _int_scale = 0, double _pnz = 1.0) {
    kind = Gaussian;
    gaussian.mean = _mean;
    gaussian.stddev = _stddev;
@ -125,7 +128,8 @@ struct Distribution {
inline std::ostream &operator<<(std::ostream &out, cutlass::Distribution const &dist) {
  switch (dist.kind) {
    case cutlass::Distribution::Uniform:
      out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max;
      out << "uniform, min: " << dist.uniform.min << ", max: " << dist.uniform.max
          << ", pnan: " << dist.uniform.pnan;
      break;
    case cutlass::Distribution::Gaussian:
      out << "gaussian, mean: " << dist.gaussian.mean << ", stddev: " << dist.gaussian.stddev
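set_uniform() therefore grows a fourth parameter. Note the semantics: despite the "percent" wording in the comment, pnan is compared directly against a uniform draw in (0, 1] by the fill functors further below, so it is a fraction; 0.01 yields roughly 1% NaN elements. A small sketch, assuming only the distribution header shown above:

    #include <iostream>
    #include "cutlass/util/distribution.h"

    int main() {
      cutlass::Distribution dist;
      // min = -4, max = 4, int_scale = 0, and ~1% of draws become NaN.
      dist.set_uniform(-4.0, 4.0, 0, 0.01);
      std::cout << dist << "\n";  // "uniform, min: -4, max: 4, pnan: 0.01"
      return 0;
    }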
@ -177,16 +177,25 @@ public:
  void reserve(
    size_t count,                  ///< size of tensor in elements
    bool device_backed_ = true) {  ///< if true, device memory is also allocated
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
    CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve(count=" << count << ", device_backed_=" << (device_backed_ ? "true" : "false") << ")");
#endif

    device_.reset();
    host_.clear();

    size_t count_container = count_to_container_storage_unit_count(count);
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
    CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: host_.resize(" << count_container << ")");
#endif
    host_.resize(count_container);

    // Allocate memory
    StorageUnit* device_memory = nullptr;
    if (device_backed_) {
#if (CUTLASS_DEBUG_TRACE_LEVEL > 1)
      CUTLASS_TRACE_HOST("cutlass::HostTensor::reserve: device_memory::allocate(" << count_container << ")");
#endif
      device_memory = device_memory::allocate<StorageUnit>(count_container);
    }
    device_.reset(device_memory, device_backed_ ? count_container : 0);
@ -394,7 +403,7 @@ public:
  void sync_device() {
    if (device_backed()) {
      device_memory::copy_to_device(
        device_.get(), host_.data(), host_.capacity());
        device_.get(), host_.data(), host_.size());
    }
  }
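A short usage sketch: reserve() frees and reallocates both sides, and sync_device() now copies host_.size() storage units (the live extent) rather than the capacity, so trailing slack in the host vector is no longer transferred. Shapes below are illustrative:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/layout/matrix.h"

    int main() {
      cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 256});
      tensor.host_view().at({0, 0}) = 1.0f;
      tensor.sync_device();  // copies host_.size() storage units to the device
      return 0;
    }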
@ -35,6 +35,8 @@
#pragma once

#include "cute/layout.hpp"
#include "cute/container/array.hpp"    // cute::array
#include "cutlass/conv/convolution.h"  // cutlass::conv::Operator

/////////////////////////////////////////////////////////////////////////////////////////////////

@ -57,6 +57,7 @@
#include "cutlass/complex.h"
#include "cutlass/tensor_view.h"
#include "cutlass/blas3.h"
#include "cutlass/numeric_types.h"

#include "cutlass/layout/vector.h"

@ -117,6 +118,7 @@ struct RandomGaussianFunc {
  int int_scale;
  FloatType float_scale_up;
  FloatType float_scale_down;
  int exclude_zero;  ///< If non-negative, excludes zeros

  //
  // Methods
@ -127,12 +129,14 @@ struct RandomGaussianFunc {
    uint64_t seed_ = 0,
    Element mean_ = 0,
    Element stddev_ = 1,
    int int_scale_ = -1
    int int_scale_ = -1,
    int exclude_zero_ = -1
  ):
    seed(seed_),
    mean(static_cast<FloatType>(mean_)),
    stddev(static_cast<FloatType>(stddev_)),
    int_scale(int_scale_) {
    int_scale(int_scale_),
    exclude_zero(exclude_zero_) {

    float_scale_up = FloatType(IntType(2) << int_scale);  // scale up to clamp low order bits
    float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);
@ -178,6 +182,15 @@ struct RandomGaussianFunc {
      result = Element(rnd);
    }

    if (params.exclude_zero >= 0 && result == Element(0.0)) {
      if (rnd > FloatType(0)) {
        rnd += FloatType(1);
      } else {
        rnd -= FloatType(1);
      }
      result = Element(rnd);
    }

    return result;
  }
};
@ -203,6 +216,7 @@ struct RandomGaussianFunc<complex<Real>> {
  int int_scale;
  FloatType float_scale_up;
  FloatType float_scale_down;
  int exclude_zero;  ///< If non-negative, excludes zeros

  //
  // Methods
@ -213,12 +227,14 @@ struct RandomGaussianFunc<complex<Real>> {
    uint64_t seed_ = 0,
    Real mean_ = 0,
    Real stddev_ = 1,
    int int_scale_ = -1
    int int_scale_ = -1,
    int exclude_zero_ = -1
  ):
    seed(seed_),
    mean(static_cast<FloatType>(mean_)),
    stddev(static_cast<FloatType>(stddev_)),
    int_scale(int_scale_) {
    int_scale(int_scale_),
    exclude_zero(exclude_zero_) {

    float_scale_up = FloatType(IntType(1) << int_scale);
    float_scale_up += FloatType(0.5) * float_scale_up;
@ -272,6 +288,18 @@ struct RandomGaussianFunc<complex<Real>> {
      result = Element(Real(rnd_r), Real(rnd_i));
    }

    if (params.exclude_zero >= 0 &&
        result.real() == Real(0.0) &&
        result.imag() == Real(0.0)) {

      if (rnd_r > FloatType(0)) {
        rnd_r += FloatType(1);
      } else {
        rnd_r -= FloatType(1);
      }
      result = Element(Real(rnd_r), Real(rnd_i));
    }

    return result;
  }
};
@ -358,6 +386,7 @@ void TensorFillRandomGaussian(
  int bits = -1,          ///< If non-negative, specifies number of fractional bits that
                          ///  are not truncated to zero. Permits reducing precision of
                          ///  data.
  int exclude_zero = -1,  ///< If non-negative, excludes zeros from tensor init
  cudaStream_t stream = nullptr) {

  using RandomFunc = detail::RandomGaussianFunc<Element>;
@ -366,7 +395,7 @@ void TensorFillRandomGaussian(

  TensorForEach<Func, Layout::kRank, Params>(
    view.extent(),
    Params(view, typename RandomFunc::Params(seed, mean, stddev, bits)),
    Params(view, typename RandomFunc::Params(seed, mean, stddev, bits, exclude_zero)),
    /*grid_size*/0, /*block_size*/0,
    stream
  );
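The exclude_zero flag threads from TensorFillRandomGaussian down to the functor: whenever a draw rounds to exactly zero, the functor shifts the pre-rounding value by +/-1 and reconverts, so the output contains no zeros (at the cost of slightly inflating the +/-1 population). A sketch against the device-side API, assuming the parameter order shown above:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/util/reference/device/tensor_fill.h"
    #include "cutlass/layout/matrix.h"

    int main() {
      cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({64, 64});
      cutlass::reference::device::TensorFillRandomGaussian(
          tensor.device_view(), /*seed=*/2024ULL, /*mean=*/0.0f, /*stddev=*/2.0f,
          /*bits=*/-1, /*exclude_zero=*/0);  // any value >= 0 enables the exclusion
      tensor.sync_host();
      return 0;
    }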
@ -399,7 +428,7 @@ void BlockFillRandomGaussian(

namespace detail {

/// Computes a random Gaussian distribution
/// Computes a random uniform distribution
template <typename Element>  ///< Element type
struct RandomUniformFunc {

@ -424,8 +453,10 @@ struct RandomUniformFunc {
  FloatType range;
  FloatType max;
  int int_scale;
  double pnan;
  FloatType float_scale_up;
  FloatType float_scale_down;
  int exclude_zero;  ///< If non-negative, excludes zeros

  /// Default ctor
  CUTLASS_HOST_DEVICE
@ -440,15 +471,25 @@ struct RandomUniformFunc {
    uint64_t seed_ = 0,
    Element max_ = 1,
    Element min = 0,
    int int_scale_ = -1
    int int_scale_ = -1,
    double pnan_ = 0,
    int exclude_zero_ = -1
  ):
    seed(seed_),
    range(static_cast<FloatType>(max_) - static_cast<FloatType>(min)),
    max(static_cast<FloatType>(max_)),
    int_scale(int_scale_) {
    int_scale(int_scale_),
    pnan(pnan_),
    exclude_zero(exclude_zero_) {

    float_scale_up = FloatType(IntType(2) << int_scale);  // scale up to clamp low order bits
    float_scale_down = FloatType(1) / FloatType(IntType(2) << int_scale);

    // Handle cases where min = 0 or max = 0 for excluding zeros
    if (exclude_zero >= 0) {
      range = (min == Element(0)) ? range - FloatType(1) : range;
      max = (max_ == Element(0)) ? max - FloatType(1) : max;
    }
  }
};

@ -479,6 +520,13 @@ struct RandomUniformFunc {
  CUTLASS_DEVICE
  Element operator()() {

    // Draw random float in [0.0, 1.0] to determine if element should be NaN.
    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
      if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
        return Element(NAN);
      }
    }

    FloatType rnd = random_uniform_float<FloatType>(&rng_state);
    rnd = params.max - params.range * rnd;

@ -494,6 +542,15 @@ struct RandomUniformFunc {
      result = Element(rnd);
    }

    if (params.exclude_zero >= 0 && result == Element(0.0)) {
      if (rnd > FloatType(0)) {
        rnd = std::min(params.max, rnd + FloatType(1));
      } else {
        rnd = std::max((params.max - params.range), rnd - FloatType(1));
      }
      result = Element(rnd);
    }

    return result;
  }
};
FloatType range;
|
||||
FloatType min;
|
||||
int int_scale;
|
||||
double pnan;
|
||||
FloatType float_scale_up;
|
||||
FloatType float_scale_down;
|
||||
int exclude_zero; ///< If non-negative, excludes zeros
|
||||
|
||||
/// Default ctor
|
||||
CUTLASS_HOST_DEVICE
|
||||
@ -541,16 +600,26 @@ struct RandomUniformFunc<complex<Real>> {
|
||||
uint64_t seed_ = 0,
|
||||
FloatType max = 1,
|
||||
FloatType min_ = 0,
|
||||
int int_scale_ = -1
|
||||
int int_scale_ = -1,
|
||||
double pnan_ = 0,
|
||||
int exclude_zero_ = -1
|
||||
):
|
||||
seed(seed_),
|
||||
range(static_cast<FloatType>(max - min_)),
|
||||
min(static_cast<FloatType>(min_)),
|
||||
int_scale(int_scale_) {
|
||||
int_scale(int_scale_),
|
||||
pnan(pnan_),
|
||||
exclude_zero(exclude_zero_) {
|
||||
|
||||
float_scale_up = FloatType(IntType(1) << int_scale);
|
||||
float_scale_up += FloatType(0.5) * float_scale_up;
|
||||
float_scale_down = FloatType(1) / FloatType(IntType(1) << int_scale);
|
||||
|
||||
// Handle cases where min = 0 or max = 0 for excluding zeros
|
||||
if (exclude_zero >= 0) {
|
||||
min = (min == FloatType(0)) ? min + FloatType(1): min;
|
||||
range = (max == FloatType(0)) ? range - FloatType(1): range;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -581,6 +650,13 @@ struct RandomUniformFunc<complex<Real>> {
|
||||
CUTLASS_DEVICE
|
||||
Element operator()() {
|
||||
|
||||
// Draw random float in [0.0, 1.0] to determine if element should be NaN.
|
||||
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
|
||||
if (params.pnan > 0 && (curand_uniform(&rng_state) < (params.pnan))) {
|
||||
return Element(Real(NAN), Real(NAN));
|
||||
}
|
||||
}
|
||||
|
||||
FloatType rnd_r = random_uniform_float<FloatType>(&rng_state);
|
||||
FloatType rnd_i = random_uniform_float<FloatType>(&rng_state);
|
||||
|
||||
@ -604,11 +680,23 @@ struct RandomUniformFunc<complex<Real>> {
|
||||
result = Element(Real(rnd_r), Real(rnd_i));
|
||||
}
|
||||
|
||||
if (params.exclude_zero >= 0 &&
|
||||
result.real() == Real(0.0) &&
|
||||
result.imag() == Real(0.0)) {
|
||||
|
||||
if (rnd_r > FloatType(0)) {
|
||||
rnd_r = std::min(params.min + params.range, rnd_r + FloatType(1));
|
||||
} else {
|
||||
rnd_r = std::max((params.min), rnd_r - FloatType(1));
|
||||
}
|
||||
result = Element(Real(rnd_r), Real(rnd_i));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
/// Computes a random Gaussian distribution
|
||||
/// Computes a random uniform distribution
|
||||
template <
|
||||
typename Element, ///< Element type
|
||||
typename Layout> ///< Layout function
|
||||
@ -693,13 +781,15 @@ void TensorFillRandomUniform(
|
||||
int bits = -1, ///< If non-negative, specifies number of fractional bits that
|
||||
/// are not truncated to zero. Permits reducing precision of
|
||||
/// data.
|
||||
double pnan = 0, ///< Percentage of NaN elements.
|
||||
int exclude_zero = -1, ///< If non-negative, excludes zeros from tensor init
|
||||
cudaStream_t stream = nullptr) {
|
||||
|
||||
using RandomFunc = detail::RandomUniformFunc<Element>;
|
||||
using Func = detail::TensorFillRandomUniformFunc<Element, Layout>;
|
||||
using Params = typename Func::Params;
|
||||
|
||||
typename RandomFunc::Params random(seed, max, min, bits);
|
||||
typename RandomFunc::Params random(seed, max, min, bits, pnan, exclude_zero);
|
||||
|
||||
TensorForEach<Func, Layout::kRank, Params>(
|
||||
view.extent(),
|
||||
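Putting the two new knobs together: pnan injects NaNs only for element types that actually have a quiet NaN (the if constexpr guard above), and exclude_zero clamps the +/-1 replacement back into [min, max]. A sketch, assuming the parameter order of the updated signature:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/util/reference/device/tensor_fill.h"
    #include "cutlass/numeric_types.h"
    #include "cutlass/layout/matrix.h"

    int main() {
      cutlass::HostTensor<cutlass::half_t, cutlass::layout::RowMajor> tensor({64, 64});
      cutlass::reference::device::TensorFillRandomUniform(
          tensor.device_view(), /*seed=*/7ULL,
          cutlass::half_t(1.0f),   // max
          cutlass::half_t(-1.0f),  // min
          /*bits=*/-1,
          /*pnan=*/0.05,           // each element becomes NaN with ~5% probability
          /*exclude_zero=*/0);     // keep exact zeros out of the remaining draws
      tensor.sync_host();
      return 0;
    }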
@ -722,11 +812,12 @@ void BlockFillRandomUniform(
  int bits = -1,    ///< If non-negative, specifies number of fractional bits that
                    ///  are not truncated to zero. Permits reducing precision of
                    ///  data.
  double pnan = 0,  ///< Percentage of NaN elements.
  cudaStream_t stream = nullptr) {

  using RandomFunc = detail::RandomUniformFunc<Element>;

  typename RandomFunc::Params params(seed, max, min, bits);
  typename RandomFunc::Params params(seed, max, min, bits, pnan);

  BlockForEach<Element, RandomFunc>(ptr, capacity, params, /*grid_size*/0, /*block_size*/0, stream);
}
@ -1672,7 +1763,11 @@ void TensorFillRandom(
  TensorView<Element, Layout> view,  ///< destination tensor
  uint64_t seed,
  Distribution dist,
  cudaStream_t stream = nullptr) {
  cudaStream_t stream = nullptr,
  int exclude_zero = -1  ///< If non-negative, excludes 0.
                         ///  Note that setting this flag will result in more 1's,
                         ///  as we use a simple mechanism to replace 0's by adding/subtracting 1's.
) {

  using Real = typename RealType<Element>::Type;

@ -1683,6 +1778,7 @@ void TensorFillRandom(
      static_cast<Real>(dist.gaussian.mean),
      static_cast<Real>(dist.gaussian.stddev),
      dist.int_scale,
      exclude_zero,
      stream);
  } else if (dist.kind == Distribution::Uniform) {
    TensorFillRandomUniform<Element, Layout>(
@ -1691,6 +1787,8 @@ void TensorFillRandom(
      static_cast<Real>(dist.uniform.max),
      static_cast<Real>(dist.uniform.min),
      dist.int_scale,
      dist.uniform.pnan,
      exclude_zero,
      stream);
  }
}
@ -1753,6 +1851,7 @@ void BlockFillRandom(
      static_cast<Real>(dist.uniform.max),
      static_cast<Real>(dist.uniform.min),
      dist.int_scale,
      dist.uniform.pnan,
      stream);
  }
}
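TensorFillRandom is the Distribution-driven entry point, so exclude_zero had to be appended after the stream parameter to keep existing call sites compiling; uniform.pnan is forwarded automatically. A sketch:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/util/distribution.h"
    #include "cutlass/util/reference/device/tensor_fill.h"
    #include "cutlass/layout/matrix.h"

    int main() {
      cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({32, 32});
      cutlass::Distribution dist;
      dist.set_uniform(-2.0, 2.0);  // pnan defaults to 0
      cutlass::reference::device::TensorFillRandom(
          tensor.device_view(), /*seed=*/99ULL, dist,
          /*stream=*/nullptr,
          /*exclude_zero=*/0);  // doc caveat above: zeros become +/-1, biasing ones
      tensor.sync_host();
      return 0;
    }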
@ -128,7 +128,8 @@ template<
  class EpilogueFusionParams
>
struct ConvReferenceImpl {
  using ElementAcc = typename EpilogueFusionParams::ElementAcc;
  // Hard-code the accumulator type to float to avoid data loss in the accumulating adds.
  using ElementAcc = cutlass::platform::conditional_t<cutlass::platform::is_same_v<typename EpilogueFusionParams::ElementAcc, double>, double, float>;
  using ElementC = typename EpilogueFusionParams::ElementC;
  using ElementOut = typename EpilogueFusionParams::ElementOut;
  using ElementScalar = typename EpilogueFusionParams::ElementScalar;
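The motivation is the usual one for reference kernels: accumulating low-precision products in the same low precision stalls once the running sum's ulp exceeds the addend. A self-contained illustration of the effect the conditional_t above avoids (double stays double; everything else accumulates in float):

    #include <iostream>
    #include "cutlass/numeric_types.h"

    int main() {
      float acc_f32 = 0.0f;
      cutlass::half_t acc_f16(0.0f);
      cutlass::half_t a(0.1f), b(0.01f);
      for (int k = 0; k < 4096; ++k) {
        acc_f32 += float(a) * float(b);  // promote, then accumulate (reference behavior)
        acc_f16 = acc_f16 + a * b;       // fp16 accumulation stalls as the sum grows
      }
      // acc_f32 lands near 4.1; acc_f16 plateaus well below it once the
      // addend drops under half an ulp of the running sum.
      std::cout << acc_f32 << " vs " << float(acc_f16) << "\n";
      return 0;
    }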
@ -342,7 +342,8 @@ void gett_epilogue(
        ElementCompute converted_acc = accumulator_converter(acc[m_b][n_b]);
        // per-row alpha
        if (raw_pointer_cast(epilogue_params.Valpha.data())) {
          converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b));
          converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l));
          converted_alpha = mul(converted_alpha, mul(converted_scale_a, converted_scale_b));
        }
        ElementCompute output = mul(converted_alpha, converted_acc);

@ -355,7 +356,8 @@ void gett_epilogue(
          ElementCompute converted_src = source_converter(epilogue_params.C(m + m_b, n + n_b, l));
          // per-row beta
          if (epilogue_params.Vbeta.data()) {
            converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b));
            converted_beta = scale_converter(epilogue_params.Vbeta(m + m_b, n + n_b, l));
            converted_beta = mul(converted_beta, converted_scale_c);
          }
          output = epilogue_fma(converted_beta, converted_src, output);
        }
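These two hunks generalize the alpha/beta vectors from per-row to per-coordinate addressing: Valpha and Vbeta are now indexed with the full (m, n, l) coordinate. Illustrative fragment only, following the names in the hunk:

    // Per-row data keeps working if Valpha's n/l strides are zero, so the
    // extra coordinates resolve to the same element (a broadcast), while
    // genuinely per-batch or per-column scale vectors are now expressible.
    converted_alpha = scale_converter(epilogue_params.Valpha(m + m_b, n + n_b, l));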
@ -159,6 +159,7 @@ struct RandomGaussianFunc {
  int int_scale;
  double pi;
  double pnz;
  bool exclude_zero;

  //
  // Methods
@ -168,9 +169,10 @@ struct RandomGaussianFunc {
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 100.0
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

@ -191,7 +193,7 @@ struct RandomGaussianFunc {
    // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz / 100);
    std::bernoulli_distribution bernoulli_dist(pnz);
    bool bernoulli_result = bernoulli_dist(bernoulli_rnd);

    // Sample from the Gaussian distribution for a nonzero element
@ -208,6 +210,16 @@ struct RandomGaussianFunc {
      result = static_cast<Element>(0);
    }

    // Note that exclude_zero = true will disable the bernoulli_result above by unsetting zeros
    if (exclude_zero && result == Element(0)) {
      if (rnd > 0) {
        rnd += 1;
      } else {
        rnd -= 1;
      }
      result = Element(rnd);
    }

    return result;
  }
};
@ -222,6 +234,7 @@ struct RandomGaussianFunc<complex<Element> > {
  int int_scale;
  double pi;
  double pnz;
  bool exclude_zero;

  //
  // Methods
@ -231,9 +244,10 @@ struct RandomGaussianFunc<complex<Element> > {
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 100.0
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

@ -249,7 +263,7 @@ struct RandomGaussianFunc<complex<Element> > {
    // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz / 100);
    std::bernoulli_distribution bernoulli_dist(pnz);
    bool bernoulli_result = bernoulli_dist(bernoulli_rnd);

    // Sample from the Gaussian distribution for a nonzero element
@ -270,6 +284,19 @@ struct RandomGaussianFunc<complex<Element> > {
      reals[1] = from_real<Element>(0);
    }

    // Note that this will invalidate the above else statement because it unsets zero elements
    if (exclude_zero &&
        reals[0] == from_real<Element>(0.0) &&
        reals[1] == from_real<Element>(0.0)) {

      if (rnd[0] > 0.0) {
        rnd[0] += 1.0;
      } else {
        rnd[0] -= 1.0;
      }
      reals[0] = from_real<Element>(rnd[0]);
    }

    return complex<Element>(reals[0], reals[1]);
  }
};
@ -284,6 +311,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
  int int_scale;
  double pi;
  double pnz;
  bool exclude_zero;

  //
  // Methods
@ -293,9 +321,10 @@ struct RandomGaussianFunc<Quaternion<Element> > {
    double mean_ = 0,
    double stddev_ = 1,
    int int_scale_ = -1,
    double pnz_ = 100.0
    double pnz_ = 1.0,
    bool exclude_zero_ = false
  ):
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_) {
    seed(seed_), mean(mean_), stddev(stddev_), int_scale(int_scale_), pi(std::acos(-1)), pnz(pnz_), exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

@ -313,7 +342,7 @@ struct RandomGaussianFunc<Quaternion<Element> > {
    // Sample from the Bernoulli distribution, and use the result to sample from the Gaussian
    std::random_device rnd_device;
    std::mt19937 bernoulli_rnd(rnd_device());
    std::bernoulli_distribution bernoulli_dist(pnz / 100);
    std::bernoulli_distribution bernoulli_dist(pnz);
    bool bernoulli_result = bernoulli_dist(bernoulli_rnd);

    // Sample from the Gaussian distribution for a nonzero element
@ -343,6 +372,21 @@ struct RandomGaussianFunc<Quaternion<Element> > {
      reals[3] = from_real<Element>(0);
    }

    // Note that this will invalidate the above else statement because it unsets zero elements
    if (exclude_zero &&
        reals[0] == from_real<Element>(0) &&
        reals[1] == from_real<Element>(0) &&
        reals[2] == from_real<Element>(0) &&
        reals[3] == from_real<Element>(0)) {

      if (rnd1[0] > 0.0) {
        rnd1[0] += 1.0;
      } else {
        rnd1[0] -= 1.0;
      }
      reals[0] = from_real<Element>(rnd1[0]);
    }

    return Quaternion<Element>(reals[0], reals[1], reals[2], reals[3]);
  }
};
@ -440,10 +484,11 @@ void TensorFillRandomGaussian(
  double mean = 0,       ///< Gaussian distribution's mean
  double stddev = 1,     ///< Gaussian distribution's standard deviation
  int bits = -1,         ///< If non-negative, specifies number of fractional bits that
  double pnz = 100.0) {  ///  are not truncated to zero. Permits reducing precision of
  double pnz = 1.0,      ///  are not truncated to zero. Permits reducing precision of
                         ///  data.
  bool exclude_zero = false) {  ///< Exclude zeros from tensor init.

  detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
  detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz, exclude_zero);

  detail::TensorFillGaussianFunc<Element, Layout> func(
    dst,
@ -466,8 +511,9 @@ void TensorFillRandomGaussian(
  double mean = 0,       ///< Gaussian distribution's mean
  double stddev = 1,     ///< Gaussian distribution's standard deviation
  int bits = -1,         ///< If non-negative, specifies number of fractional bits that
  double pnz = 100.0) {  ///  are not truncated to zero. Permits reducing precision of
  double pnz = 1.0,      ///  are not truncated to zero. Permits reducing precision of
                         ///  data.
  bool exclude_zero = false) {  ///< Exclude zeros from tensor init.

  TensorFillRandomGaussian(dst.view_real(), seed, mean, stddev, bits, pnz);
  TensorFillRandomGaussian(dst.view_imag(), ~seed, mean, stddev, bits, pnz);
@ -485,7 +531,7 @@ void TensorFillSymmetricRandomGaussian(
  double mean = 0,       ///< Gaussian distribution's mean
  double stddev = 1,     ///< Gaussian distribution's standard deviation
  int bits = -1,         ///< If non-negative, specifies number of fractional bits that
  double pnz = 100.0) {  ///  are not truncated to zero. Permits reducing precision of
  double pnz = 1.0) {    ///  are not truncated to zero. Permits reducing precision of
                         ///  data.

  detail::RandomGaussianFunc<Element> random_func(seed, mean, stddev, bits, pnz);
@ -515,7 +561,7 @@ void BlockFillRandomGaussian(
  double mean = 0,       ///< Gaussian distribution's mean
  double stddev = 1,     ///< Gaussian distribution's standard deviation
  int bits = -1,         ///< If non-negative, specifies number of fractional bits that
  double pnz = 100.0) {  ///  are not truncated to zero. Permits reducing precision of
  double pnz = 1.0) {    ///  are not truncated to zero. Permits reducing precision of
                         ///  data.

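On the host side the same pair of knobs lands with slightly different spellings: pnz switches from a percentage (default 100.0) to a fraction (default 1.0), and exclude_zero is a bool rather than a sentinel int. A sketch against the host reference fill, assuming the updated parameter order above:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/util/reference/host/tensor_fill.h"
    #include "cutlass/layout/matrix.h"

    int main() {
      cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({16, 16});
      // ~75% of elements draw from the Gaussian, the rest are zeroed; with
      // exclude_zero = true those zeros would then be replaced, overriding pnz.
      cutlass::reference::host::TensorFillRandomGaussian(
          tensor.host_view(), /*seed=*/1ULL, /*mean=*/0.0, /*stddev=*/1.0,
          /*bits=*/-1, /*pnz=*/0.75, /*exclude_zero=*/false);
      return 0;
    }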
@ -542,23 +588,47 @@ struct RandomUniformFunc {
|
||||
double min;
|
||||
int int_scale;
|
||||
|
||||
//
|
||||
// Methods
|
||||
//
|
||||
double pnan;
|
||||
private:
|
||||
using engine_type = std::mt19937;
|
||||
public:
|
||||
engine_type bernoulli_rnd;
|
||||
std::bernoulli_distribution bernoulli_dist;
|
||||
|
||||
bool exclude_zero;
|
||||
|
||||
RandomUniformFunc(
|
||||
uint64_t seed_ = 0,
|
||||
double max = 1,
|
||||
double min_ = 0,
|
||||
int int_scale_ = -1
|
||||
int int_scale_ = -1,
|
||||
double pnan_ = 0,
|
||||
bool exclude_zero_ = false
|
||||
):
|
||||
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
|
||||
seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
|
||||
, bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
|
||||
, bernoulli_dist(pnan_)
|
||||
, exclude_zero(exclude_zero_)
|
||||
{
|
||||
std::srand((unsigned)seed);
|
||||
}
|
||||
|
||||
// Handle cases where min = 0 or max = 0 for excluding zeros
|
||||
if (exclude_zero) {
|
||||
min = (min == 0.0) ? min + 1: min;
|
||||
range = (max == 0.0) ? range - 1: range;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Compute random value and update RNG state
|
||||
Element operator()() const {
|
||||
Element operator()() {
|
||||
|
||||
// Sample from NaN distribution.
|
||||
if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
|
||||
if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
|
||||
return Element(NAN);
|
||||
}
|
||||
}
|
||||
|
||||
double rnd = double(std::rand()) / double(RAND_MAX);
|
||||
|
||||
@ -575,6 +645,15 @@ struct RandomUniformFunc {
      result = static_cast<Element>(Real(rnd));
    }

    if (exclude_zero && result == Element(0)) {
      if (rnd > 0.0) {
        rnd = std::min(min + range, rnd + 1.0);
      } else {
        rnd = std::max(min, rnd - 1.0);
      }
      result = static_cast<Element>(Real(rnd));
    }

    return result;
  }
};
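The zero-exclusion rule: if the drawn value quantizes to zero in the element type, nudge it by 1.0 away from zero and clamp back into [min, min + range]. As the TensorFillRandom doc comment later concedes, this over-represents ones. A standalone sketch of the remapping (not the CUTLASS code itself):

#include <algorithm>

template <typename Element>
Element exclude_zero_remap(double rnd, double min, double range) {
  Element result = static_cast<Element>(rnd);
  if (result == Element(0)) {
    // Shift by one away from zero, clamped to the distribution's bounds.
    rnd = (rnd > 0.0) ? std::min(min + range, rnd + 1.0)
                      : std::max(min, rnd - 1.0);
    result = static_cast<Element>(rnd);
  }
  return result;
}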
@ -590,6 +669,15 @@ struct RandomUniformFunc<complex<Element> > {
  double min;
  int int_scale;

  double pnan;
 private:
  using engine_type = std::mt19937;
 public:
  engine_type bernoulli_rnd;
  std::bernoulli_distribution bernoulli_dist;

  bool exclude_zero;

  //
  // Methods
  //
@ -598,15 +686,33 @@ struct RandomUniformFunc<complex<Element> > {
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1
    int int_scale_ = -1,
    double pnan_ = 0,
    bool exclude_zero_ = false
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_)
    , bernoulli_rnd{static_cast<engine_type::result_type>(seed_)}
    , bernoulli_dist(pnan_)
    , exclude_zero(exclude_zero_) {
    std::srand((unsigned)seed);
  }

    // Handle cases where min = 0 or max = 0 for excluding zeros
    if (exclude_zero) {
      min = (min == 0.0) ? min + 1 : min;
      range = (max == 0.0) ? range - 1 : range;
    }
  }

  /// Compute random value and update RNG state
  complex<Element> operator()() const {
  complex<Element> operator()() {

    // Sample from NaN distribution.
    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
      if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
        return Element(NAN);
      }
    }

    Element reals[2];
@ -625,6 +731,19 @@ struct RandomUniformFunc<complex<Element> > {
      else {
        reals[i] = from_real<Element>(Real(rnd));
      }

      if (exclude_zero &&
          i == 0 &&
          reals[0] == from_real<Element>(0.0)) {

        if (rnd > 0.0) {
          rnd = std::min(min + range, rnd + 1.0);
        } else {
          rnd = std::max(min, rnd - 1.0);
        }
        reals[0] = from_real<Element>(Real(rnd));
      }

    }

    return complex<Element>(reals[0], reals[1]);
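Note the design choice in the complex specialization: only the real component (i == 0) is remapped. Once the real part is guaranteed non-zero, the complex value as a whole cannot equal zero regardless of the imaginary sample, so the imaginary component is left untouched.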
@ -642,6 +761,13 @@ struct RandomUniformFunc<Quaternion<Element> > {
  double min;
  int int_scale;

  double pnan;
 private:
  using engine_type = std::mt19937;
 public:
  engine_type bernoulli_rnd;
  std::bernoulli_distribution bernoulli_dist;

  //
  // Methods
  //
@ -650,15 +776,26 @@ struct RandomUniformFunc<Quaternion<Element> > {
    uint64_t seed_ = 0,
    double max = 1,
    double min_ = 0,
    int int_scale_ = -1
    int int_scale_ = -1,
    double pnan_ = 0
  ):
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_) {
    std::srand((unsigned)seed);
  }
    seed(seed_), range(max - min_), min(min_), int_scale(int_scale_), pnan(pnan_),
    bernoulli_rnd{static_cast<engine_type::result_type>(seed_)},
    bernoulli_dist(pnan_)
  {
    std::srand((unsigned)seed);
  }

  /// Compute random value and update RNG state
  Quaternion<Element> operator()() const {
  Quaternion<Element> operator()() {

    // Sample from NaN distribution.
    if constexpr (std::numeric_limits<Element>::has_quiet_NaN) {
      if (pnan > 0 && bernoulli_dist(bernoulli_rnd)) {
        return Element(NAN);
      }
    }

    Element reals[4];
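The Quaternion specialization gains only the pnan path; its constructor above takes no exclude_zero parameter, so within this header zero-exclusion applies to the scalar and complex uniform fills only.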
@ -712,7 +849,7 @@ struct TensorFillRandomUniformFunc {
  }

  /// Compute random value and update RNG state
  void operator()(Coord<Layout::kRank> const &coord) const {
  void operator()(Coord<Layout::kRank> const &coord) {

    view.at(coord) = func();
  }
@ -749,7 +886,7 @@ struct TensorFillSymmetricRandomUniformFunc {
  }

  /// Compute random value and update RNG state
  void operator()(Coord<Layout::kRank> const &coord) const {
  void operator()(Coord<Layout::kRank> const &coord) {
    // Fill half of matrix based on FillMode
    if (Layout::kRank == 2 &&
        fill_mode == cutlass::FillMode::kLower &&
@ -796,7 +933,7 @@ struct TensorFillPadDiagonalRandomUniformFunc {
  }

  /// Compute random value and update RNG state
  void operator()(Coord<Layout::kRank> const &coord) const {
  void operator()(Coord<Layout::kRank> const &coord) {
    // Fill half of matrix based on FillMode
    if (Layout::kRank == 2 &&
        (fill_mode == cutlass::FillMode::kLower) &&
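These functor operator() overloads lose their const qualifier because the wrapped RandomUniformFunc now owns a std::mt19937 engine and a std::bernoulli_distribution, both of which advance internal state on every draw. A minimal illustration of why a state-advancing functor cannot stay const (hypothetical type):

#include <random>

struct Sampler {
  std::mt19937 rng{42};
  std::bernoulli_distribution coin{0.5};

  bool operator()() {  // non-const: rng and coin mutate on each call
    return coin(rng);
  }
};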
@ -825,10 +962,12 @@ void TensorFillRandomUniform(
  uint64_t seed,                ///< seed for RNG
  double max = 1,               ///< upper bound of distribution
  double min = 0,               ///< lower bound for distribution
  int bits = -1) {              ///< If non-negative, specifies number of fractional bits that
  int bits = -1,                ///< If non-negative, specifies number of fractional bits that
                                ///  are not truncated to zero. Permits reducing precision of
                                ///  data.
  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
                                ///  data.
  double pnan = 0,              ///< Percentage of NaN elements.
  bool exclude_zero = false) {  ///< Exclude zero from tensor init
  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan, exclude_zero);

  detail::TensorFillRandomUniformFunc<Element, Layout> func(
    dst,
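A plausible call site for the extended signature, assuming the usual HostTensor helper from cutlass/util/host_tensor.h (the tensor type and sizes here are illustrative, not part of this change):

#include <cutlass/util/host_tensor.h>
#include <cutlass/util/reference/host/tensor_fill.h>

// Fill a row-major fp32 host tensor so that about 1% of entries are NaN
// and no entry is exactly zero.
void fill_example() {
  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({128, 128});
  cutlass::reference::host::TensorFillRandomUniform(
      A.host_view(),
      /* seed = */ 2024,
      /* max = */ 4.0,
      /* min = */ -4.0,
      /* bits = */ -1,
      /* pnan = */ 0.01,
      /* exclude_zero = */ true);
}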
@ -850,12 +989,14 @@ void TensorFillRandomUniform(
  uint64_t seed,                ///< seed for RNG
  double max = 1,               ///< upper bound of distribution
  double min = 0,               ///< lower bound for distribution
  int bits = -1) {              ///< If non-negative, specifies number of fractional bits that
  int bits = -1,                ///< If non-negative, specifies number of fractional bits that
                                ///  are not truncated to zero. Permits reducing precision of
                                ///  data.
  double pnan = 0,              ///< Percentage of NaN elements.
  bool exclude_zero = false) {  ///< Exclude zero from tensor init

  TensorFillRandomUniform(dst.view_real(), seed, max, min, bits);
  TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits);
  TensorFillRandomUniform(dst.view_real(), seed, max, min, bits, pnan, exclude_zero);
  TensorFillRandomUniform(dst.view_imag(), ~seed, max, min, bits, pnan, exclude_zero);
}
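The imaginary plane is filled from the complemented seed (~seed), so the real and imaginary parts draw from decorrelated streams rather than repeating the same values. A std-only sketch of that seed-splitting idea (any cheap injective transform of the seed would serve):

#include <cstdint>
#include <random>
#include <utility>

std::pair<std::mt19937_64, std::mt19937_64> make_streams(std::uint64_t seed) {
  // Two engines seeded from one user seed; the second uses its complement.
  return { std::mt19937_64(seed), std::mt19937_64(~seed) };
}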
@ -972,10 +1113,11 @@ void BlockFillRandomUniform(
  uint64_t seed,                ///< seed for RNG
  double max = 1,               ///< upper bound of distribution
  double min = 0,               ///< lower bound for distribution
  int bits = -1) {              ///< If non-negative, specifies number of fractional bits that
  int bits = -1,                ///< If non-negative, specifies number of fractional bits that
                                ///  are not truncated to zero. Permits reducing precision of
                                ///  data.
  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits);
                                ///  data.
  double pnan = 0) {            ///< Percentage of NaN elements.
  detail::RandomUniformFunc<Element> random_func(seed, max, min, bits, pnan);

  for (size_t i = 0; i < capacity; ++i) {
    ReferenceFactory<Element>::get(ptr, i) = random_func();
@ -1259,7 +1401,11 @@ template <
void TensorFillRandom(
  TensorView<Element, Layout> view,  ///< destination tensor
  uint64_t seed,
  Distribution dist) {
  Distribution dist,
  bool exclude_zero = false          ///< If true, excludes 0.
                                     ///  Note that setting this flag will result in more 1's,
                                     ///  as we use a simple mechanism to replace 0's by adding/subtracting 1's.
) {

  using Real = typename RealType<Element>::Type;
@ -1269,14 +1415,18 @@ void TensorFillRandom(
      seed,
      dist.gaussian.mean,
      dist.gaussian.stddev,
      dist.int_scale);
      dist.int_scale,
      dist.gaussian.pnz,
      exclude_zero);
  } else if (dist.kind == Distribution::Uniform) {
    TensorFillRandomUniform(
      view,
      seed,
      dist.uniform.max,
      dist.uniform.min,
      dist.int_scale);
      dist.int_scale,
      dist.uniform.pnan,
      exclude_zero);
  }
}
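With both branches updated, a single Distribution object can now drive NaN injection and zero exclusion through the one entry point. A hedged sketch of that path, assuming Distribution's set_uniform helper and the uniform.pnan field read by the dispatch above:

#include <cutlass/util/distribution.h>
#include <cutlass/util/host_tensor.h>
#include <cutlass/util/reference/host/tensor_fill.h>

void fill_via_distribution() {
  cutlass::Distribution dist;
  dist.set_uniform(/* min = */ -2.0, /* max = */ 2.0, /* int_scale = */ -1);
  dist.uniform.pnan = 0.0;  // assumed field, per the dispatch above

  cutlass::HostTensor<float, cutlass::layout::RowMajor> B({64, 64});
  cutlass::reference::host::TensorFillRandom(
      B.host_view(), /* seed = */ 7, dist, /* exclude_zero = */ true);
}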
@ -1354,7 +1504,8 @@ void BlockFillRandom(
      seed,
      dist.uniform.max,
      dist.uniform.min,
      dist.int_scale);
      dist.int_scale,
      dist.uniform.pnan);
  }
}
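BlockFillRandom's uniform branch now forwards dist.uniform.pnan as well, though unlike TensorFillRandom it gains no exclude_zero flag, matching the BlockFillRandomUniform signature earlier in this file. A hedged usage sketch of the block-level entry point (the pointer and capacity parameters are assumed from the fill loop shown above):

#include <cutlass/util/reference/host/tensor_fill.h>
#include <vector>

// Fill a raw host buffer, injecting NaN with probability 0.001.
void block_fill_example() {
  std::vector<float> buf(1 << 20);
  cutlass::reference::host::BlockFillRandomUniform(
      buf.data(), buf.size(), /* seed = */ 99,
      /* max = */ 1.0, /* min = */ -1.0, /* bits = */ -1, /* pnan = */ 0.001);
}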