CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.

* Minor fix in gemm op profiler for raster order.

* Add scheduler mapping for raster order in the kernels.
This commit is contained in:
ANIKET SHIVAM
2023-09-26 14:24:26 -07:00
committed by GitHub
parent e0aaa3c3b3
commit 90d3b0fb18
428 changed files with 22253 additions and 21762 deletions

View File

@ -73,7 +73,7 @@ set_target_properties(cutlass_profiler PROPERTIES EXPORT_NAME profiler)
target_include_directories(
cutlass_profiler
PRIVATE
${CMAKE_CURRENT_LIST_DIR}/src
${CMAKE_CURRENT_LIST_DIR}/include
)
#
@ -97,14 +97,14 @@ install(
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM --operation=SparseGemm --providers=cutlass --verification-providers=cublas,device,host --junit-output=test_cutlass_profiler_spgemm)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K --operation=RankK --providers=cutlass --verification-providers=cublas --junit-output=test_cutlass_profiler_rank_k)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K --operation=Rank2K --providers=cutlass --verification-providers=cublas --junit-output=test_cutlass_profiler_rank_2k)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM --operation=Trmm --providers=cutlass --verification-providers=device,host --junit-output=test_cutlass_profiler_trmm)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --providers=cutlass --verification-providers=cublas,host --junit-output=test_cutlass_profiler_symm)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM --operation=Gemm --providers=cutlass --verification-providers=cublas,device --junit-output=test_cutlass_profiler_gemm --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D --operation=Conv2d --providers=cutlass --verification-providers=cudnn,device --junit-output=test_cutlass_profiler_conv2d --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D --operation=Conv3d --providers=cutlass --verification-providers=cudnn,device,host --junit-output=test_cutlass_profiler_conv3d --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM --operation=SparseGemm --providers=cutlass --verification-providers=cublas,device,host --junit-output=test_cutlass_profiler_spgemm --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K --operation=RankK --providers=cutlass --verification-providers=cublas --junit-output=test_cutlass_profiler_rank_k --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K --operation=Rank2K --providers=cutlass --verification-providers=cublas --junit-output=test_cutlass_profiler_rank_2k --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM --operation=Trmm --providers=cutlass --verification-providers=device,host --junit-output=test_cutlass_profiler_trmm --print-kernel-before-running=true)
set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --providers=cutlass --verification-providers=cublas,host --junit-output=test_cutlass_profiler_symm --print-kernel-before-running=true)
cutlass_add_executable_tests(
test_profiler cutlass_profiler

View File

@ -383,6 +383,8 @@ public:
/// Destructor
virtual ~Conv2dOperationProfiler();
/// Returns a read-only reference to this profiler's problem_ member
Conv2dProblem const& problem() const { return problem_; }
/// Prints usage statement for the math function
virtual void print_usage(std::ostream &out) const;

View File

@ -332,6 +332,8 @@ public:
/// Destructor
virtual ~Conv3dOperationProfiler();
/// Returns a read-only reference to this profiler's problem_ member
Conv3dProblem const& problem() const { return problem_; }
/// Prints usage statement for the math function
virtual void print_usage(std::ostream &out) const;

View File

@ -82,6 +82,7 @@ public:
int split_k_slices;
int batch_count;
cutlass::library::RasterOrder raster_order;
// gemm with parallel interleaved reduction
// gemm epilogue (alpha, beta) = (1.0, 0.0)
// reduction epilogue (alpha, beta) = (GemmProblem::alpha, GemmProblem::beta)
@ -94,7 +95,8 @@ public:
GemmProblem():
mode(library::GemmUniversalMode::kGemm),
m(16), n(16), k(16), lda(0), ldb(0), ldc(0), split_k_slices(1), batch_count(1) { }
m(16), n(16), k(16), lda(0), ldb(0), ldc(0), split_k_slices(1), batch_count(1),
raster_order(cutlass::library::RasterOrder::kHeuristic){ }
/// Parses the problem
Status parse(
@ -178,6 +180,8 @@ public:
/// Destructor
virtual ~GemmOperationProfiler();
/// Returns a read-only reference to this profiler's problem_ member
GemmProblem const& problem() const { return problem_; }
/// Prints usage statement for the math function
virtual void print_usage(std::ostream &out) const;

View File

@ -247,6 +247,10 @@ public:
/// Sort results (currently by flops-per-byte)
bool sort_results;
/// Prints the name of the kernel being profiled before running the kernel.
/// This is useful for determining which kernel is causing a run of the profiler to hang
bool print_kernel_before_running;
//
// Methods
//

View File

@ -935,6 +935,15 @@ bool arg_as_IteratorAlgorithmID(
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem);
/// Lexically casts an argument to a library::RasterOrder if it is defined. Returns true if not null.
bool arg_as_RasterOrder(library::RasterOrder &raster_order, KernelArgument::Value const *value_ptr);
/// Lexically casts the argument named `name` to a library::RasterOrder if it is defined.
/// Returns true if not null.
bool arg_as_RasterOrder(
library::RasterOrder &raster_order,
char const *name,
ProblemSpace const &problem_space,
ProblemSpace::Problem const &problem);
/// Lexically casts an argument to a library::Provider if it is defined. Returns true if not null.
bool arg_as_ProviderID(library::Provider &provider, KernelArgument::Value const *value_ptr);

View File

@ -39,9 +39,8 @@
#include "cutlass/core_io.h"
#include "conv2d_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/conv2d_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
using namespace cutlass::library;

View File

@ -40,9 +40,8 @@
#include "cutlass/core_io.h"
#include "conv3d_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/conv3d_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
using namespace cutlass::library;

View File

@ -35,7 +35,7 @@
#include <stdexcept>
#if CUTLASS_ENABLE_CUBLAS
#include "cublas_helpers.h"
#include "cutlass/profiler/cublas_helpers.h"
namespace cutlass {
namespace profiler {

View File

@ -35,7 +35,7 @@
#include <stdexcept>
#include "cudnn_helpers.h"
#include "cutlass/profiler/cudnn_helpers.h"
namespace cutlass {
namespace profiler {

View File

@ -36,15 +36,15 @@
#include <stdexcept>
// Profiler includes
#include "cutlass_profiler.h"
#include "gemm_operation_profiler.h"
#include "rank_k_operation_profiler.h"
#include "rank_2k_operation_profiler.h"
#include "trmm_operation_profiler.h"
#include "symm_operation_profiler.h"
#include "conv2d_operation_profiler.h"
#include "conv3d_operation_profiler.h"
#include "sparse_gemm_operation_profiler.h"
#include "cutlass/profiler/cutlass_profiler.h"
#include "cutlass/profiler/gemm_operation_profiler.h"
#include "cutlass/profiler/rank_k_operation_profiler.h"
#include "cutlass/profiler/rank_2k_operation_profiler.h"
#include "cutlass/profiler/trmm_operation_profiler.h"
#include "cutlass/profiler/symm_operation_profiler.h"
#include "cutlass/profiler/conv2d_operation_profiler.h"
#include "cutlass/profiler/conv3d_operation_profiler.h"
#include "cutlass/profiler/sparse_gemm_operation_profiler.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -46,7 +46,7 @@
#include "cutlass/library/util.h"
#include "device_allocation.h"
#include "cutlass/profiler/device_allocation.h"
namespace cutlass {
namespace profiler {

View File

@ -32,7 +32,7 @@
\brief
*/
#include "device_context.h"
#include "cutlass/profiler/device_context.h"
namespace cutlass {
namespace profiler {

View File

@ -32,7 +32,7 @@
\brief Provides several functions for filling tensors with data.
*/
#include "enumerated_types.h"
#include "cutlass/profiler/enumerated_types.h"
namespace cutlass {
namespace profiler {

View File

@ -39,10 +39,9 @@
#include "cutlass/core_io.h"
#include "cublas_helpers.h"
#include "gemm_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/gemm_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
#include "cutlass/library/singleton.h"
#include "cutlass/library/library.h"
#include "cutlass/library/handle.h"
@ -74,6 +73,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
{ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
{ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
{ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of GEMMs computed in one batch"},
{ArgumentTypeID::kEnumerated, {"raster_order", "raster-order"}, "Raster order (heuristic, along_n, along_m)"},
},
{ library::Provider::kCUBLAS}
) {
@ -174,7 +174,7 @@ Status GemmOperationProfiler::GemmProblem::parse(
}
this->mode = library::GemmUniversalMode::kGemm;
if(this->split_k_mode == library::SplitKMode::kParallel) {
if (this->split_k_mode == library::SplitKMode::kParallel) {
this->mode = library::GemmUniversalMode::kGemmSplitKParallel;
}
@ -190,6 +190,11 @@ Status GemmOperationProfiler::GemmProblem::parse(
this->mode = library::GemmUniversalMode::kBatched;
}
if (!arg_as_RasterOrder(this->raster_order, "raster_order", problem_space, problem)) {
// default value
this->raster_order = library::RasterOrder::kHeuristic;
}
if (this->split_k_slices > 1 && this->batch_count > 1) {
// At least one of these must be one
return Status::kErrorInvalidProblem;
@ -322,6 +327,7 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
set_argument(result, "split_k_mode", problem_space, library::to_string(split_k_mode));
set_argument(result, "split_k_slices", problem_space, split_k_slices);
set_argument(result, "batch_count", problem_space, batch_count);
set_argument(result, "raster_order", problem_space, library::to_string(raster_order));
set_argument(result, "alpha", problem_space,
library::lexical_cast(alpha, operation_desc.element_epilogue));
@ -376,6 +382,8 @@ Status GemmOperationProfiler::initialize_configuration(
gemm_workspace_.arguments.alpha = problem_.alpha.data();
gemm_workspace_.arguments.beta = problem_.beta.data();
gemm_workspace_.arguments.pointer_mode = library::ScalarPointerMode::kHost;
gemm_workspace_.arguments.raster_order = problem_.raster_order;
// initialize reduction operation for parallel splitKMode
if (problem_.split_k_mode == library::SplitKMode::kParallel) {
if (!initialize_reduction_configuration_(operation, problem)) {
@ -610,7 +618,7 @@ Status GemmOperationProfiler::initialize_workspace(
results_.back().op_kind = library::OperationKind::kGemm;
results_.back().disposition = Disposition::kNotRun;
for(auto provider : verification_providers_) {
for (auto provider : verification_providers_) {
results_.back().verification_map[provider] = Disposition::kNotRun;
}
}
@ -1102,7 +1110,6 @@ Status GemmOperationProfiler::profile_cutlass_(
void *device_workspace) {
GpuTimer timer;
// initialize gemm underlying operation to handle parallel reduction
library::Operation const * underlying_operation = operation;
@ -1223,7 +1230,6 @@ Status GemmOperationProfiler::profile_cutlass_(
//
timer.stop_and_wait();
//
// Update performance result
//

View File

@ -34,7 +34,7 @@
#include <stdexcept>
#include "gpu_timer.h"
#include "cutlass/profiler/gpu_timer.h"
namespace cutlass {
namespace profiler {

View File

@ -34,9 +34,9 @@
#include <iostream>
#include "options.h"
#include "cutlass/profiler/options.h"
#include "cutlass_profiler.h"
#include "cutlass/profiler/cutlass_profiler.h"
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -47,9 +47,9 @@
// sleep not supported
#endif
#include "options.h"
#include "operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/options.h"
#include "cutlass/profiler/operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
@ -100,12 +100,11 @@ OperationProfiler::OperationProfiler(
verification_providers_.push_back(provider);
}
}
}
/// Destructor
OperationProfiler::~OperationProfiler() {
}
OperationProfiler::~OperationProfiler() {}
/// Gets the schema description
std::string const & OperationProfiler::description() const {
@ -349,6 +348,11 @@ int OperationProfiler::profile_all(
if (continue_profiling) {
if (options.report.print_kernel_before_running) {
std::cout << "Profiling kernel for JUnit test " << options.report.junit_output_path << ": "
<< operation_name << std::endl;
}
status = this->initialize_workspace(
options,
report,
@ -679,7 +683,7 @@ bool OperationProfiler::find_string_matches_(
// Search filter_tokens in operation_name in order
size_t start = 0, idx = 0;
for(auto & token : filter_tokens) {
for (auto & token : filter_tokens) {
// Check if characters left to be parsed in operation_name
if (start < operation_name.length()) {
// Find token in operation_name[start:]

View File

@ -39,7 +39,7 @@
#include "cutlass/library/util.h"
#include "options.h"
#include "cutlass/profiler/options.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -145,7 +145,7 @@ void Options::Device::print_device_info(std::ostream &out) const {
out << "Device Name,SM,CUDA Device ID,Phy Device ID" << std::endl;
for(int device = 0; device < num_devices; device++) {
for (int device = 0; device < num_devices; device++) {
result = cudaSetDevice(device);
if (result != cudaSuccess) {
throw std::runtime_error("cudaSetDevice() failed for device");
@ -587,7 +587,7 @@ Options::Report::Report(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("append", append, false);
cmdline.get_cmd_line_argument("output", output_path);
cmdline.get_cmd_line_argument("junit-output", junit_output_path);
if (cmdline.check_cmd_line_flag("tags")) {
cmdline.get_cmd_line_argument_pairs("tags", pivot_tags);
}
@ -597,6 +597,8 @@ Options::Report::Report(cutlass::CommandLine const &cmdline) {
cmdline.get_cmd_line_argument("verbose", verbose, true);
cmdline.get_cmd_line_argument("sort-results", sort_results, false);
cmdline.get_cmd_line_argument("print-kernel-before-running", print_kernel_before_running, false);
}
void Options::Report::print_usage(std::ostream &out) const {
@ -613,6 +615,10 @@ void Options::Report::print_usage(std::ostream &out) const {
<< " --junit-output=<path> "
<< " Path to junit output file for result reporting. Operation kind and '.junit.xml' is appended.\n\n"
<< " --print-kernel-before-running=<bool> "
<< " Prints the name of the kernel being profiled before running the kernel." << end_of_line
<< " This is useful for determining which kernel is causing a run of the profiler to hang\n\n"
<< " --report-not-run=<bool> "
<< " If true, reports the status of all kernels including those that" << end_of_line
<< " do not satisfy the given arguments.\n\n"
@ -634,7 +640,8 @@ void Options::Report::print_options(std::ostream &out, int indent) const {
<< indent_str(indent) << "append: " << append << "\n"
<< indent_str(indent) << "output: " << output_path << "\n"
<< indent_str(indent) << "junit-output: " << junit_output_path << "\n"
<< indent_str(indent) << "report_not_run: " << report_not_run << "\n"
<< indent_str(indent) << "print-kernel-before-running: " << print_kernel_before_running << "\n"
<< indent_str(indent) << "report-not-run: " << report_not_run << "\n"
<< indent_str(indent) << "tags:\n";
for (auto const & tag : pivot_tags) {

View File

@ -42,8 +42,8 @@
#include "cutlass/library/util.h"
#include "performance_report.h"
#include "debug.h"
#include "cutlass/profiler/performance_report.h"
#include "cutlass/profiler/debug.h"
namespace cutlass {
namespace profiler {
@ -382,6 +382,7 @@ std::ostream & PerformanceReport::print_result_csv_(
<< "," << result.gbytes_per_sec()
<< "," << result.gflops_per_sec()
;
}
else {
out << std::string(2

View File

@ -39,8 +39,8 @@
#include "cutlass/cutlass.h"
// CUTLASS Profiler includes
#include "enumerated_types.h"
#include "performance_result.h"
#include "cutlass/profiler/enumerated_types.h"
#include "cutlass/profiler/performance_result.h"
// CUTLASS Library includes
#include "cutlass/library/library.h"

View File

@ -38,7 +38,7 @@
#include "cutlass/library/util.h"
#include "problem_space.h"
#include "cutlass/profiler/problem_space.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -845,6 +845,46 @@ bool arg_as_NumericTypeID(
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Converts an enumerated kernel argument into a library::RasterOrder.
///
/// Returns false, leaving raster_order untouched, when the argument's not_null flag is unset;
/// otherwise stores the converted value in raster_order and returns true.
/// Throws std::runtime_error if the argument is not of enumerated type or if the conversion
/// yields library::RasterOrder::kInvalid.
bool arg_as_RasterOrder(
  library::RasterOrder &raster_order,
  KernelArgument::Value const *value_ptr) {

  // Argument was not defined: report that without touching the output.
  if (!value_ptr->not_null) {
    return false;
  }

  // Only enumerated arguments may be reinterpreted as EnumeratedTypeValue below.
  if (value_ptr->argument->description->type != ArgumentTypeID::kEnumerated) {
    throw std::runtime_error(
      "arg_as_RasterOrder() - illegal cast.");
  }

  auto const *enumerated_value =
    static_cast<EnumeratedTypeArgument::EnumeratedTypeValue const *>(value_ptr);

  raster_order = library::from_string<library::RasterOrder>(enumerated_value->element);

  // A kInvalid result means the element did not name a known raster order.
  if (raster_order == library::RasterOrder::kInvalid) {
    throw std::runtime_error(
      "arg_as_RasterOrder() - illegal cast.");
  }

  return true;
}
/// Resolves the argument called `name` within the problem space and converts the corresponding
/// problem value to a library::RasterOrder through the pointer-based overload.
/// Returns whatever that overload returns (true if the argument is not null).
bool arg_as_RasterOrder(
  library::RasterOrder &raster_order,
  char const *name,
  ProblemSpace const &problem_space,
  ProblemSpace::Problem const &problem) {

  size_t const argument_idx = problem_space.argument_index(name);

  return arg_as_RasterOrder(raster_order, problem.at(argument_idx).get());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Lexically casts an argument to an int64 if it is defined. Returns true if not null.
bool arg_as_LayoutTypeID(
library::LayoutTypeID &layout_type,

View File

@ -41,9 +41,9 @@
#include "cutlass/core_io.h"
#include "cublas_helpers.h"
#include "rank_2k_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/rank_2k_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -41,9 +41,9 @@
#include "cutlass/core_io.h"
#include "cublas_helpers.h"
#include "rank_k_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/rank_k_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -38,9 +38,9 @@
#include <iomanip>
#include <ios>
#include "cublas_helpers.h"
#include "sparse_gemm_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/sparse_gemm_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -41,9 +41,9 @@
#include "cutlass/core_io.h"
#include "cublas_helpers.h"
#include "symm_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/symm_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -41,9 +41,9 @@
#include "cutlass/core_io.h"
#include "cublas_helpers.h"
#include "trmm_operation_profiler.h"
#include "gpu_timer.h"
#include "cutlass/profiler/cublas_helpers.h"
#include "cutlass/profiler/trmm_operation_profiler.h"
#include "cutlass/profiler/gpu_timer.h"
/////////////////////////////////////////////////////////////////////////////////////////////////