Updates for 3.1 (#932)

This commit is contained in:
ANIKET SHIVAM
2023-04-29 06:34:27 -07:00
committed by GitHub
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions

View File

@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid
cutlass_add_executable_tests(
test_profiler cutlass_profiler
DEPENDEES test_all
TEST_COMMAND_OPTIONS
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM
TEST_COMMAND_OPTIONS
GEMM
CONV2D
CONV3D
SPGEMM
RANK_K
RANK_2K
TRMM
SYMM
TEST_COMMAND_OPTIONS_PREFIX
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_
DISABLE_EXECUTABLE_INSTALL_RULE
)

View File

@ -124,7 +124,7 @@ int CutlassProfiler::operator()() {
options_.execution_mode == ExecutionMode::kTrace) {
// Profiles all operations
profile_();
return profile_();
}
else if (options_.execution_mode == ExecutionMode::kEnumerate) {
// Enumerates all operations
@ -157,7 +157,7 @@ int CutlassProfiler::profile_() {
if (result) {
return result;
}
}
}
}

View File

@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const {
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_from_device(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) {
/// Copies from an equivalent-sized tensor in host memory
void DeviceAllocation::copy_from_host(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed host-to-device copy");
}
}
/// Copies to an equivalent-sized tensor in host memory
void DeviceAllocation::copy_to_host(void *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed device-to-host copy");
}
}
void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
switch (type_) {
case library::NumericTypeID::kFE4M3:
cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>(
reinterpret_cast<cutlass::float_e4m3_t *>(pointer_),
reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()),
capacity_,
seed,
dist
@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
break;
case library::NumericTypeID::kFE5M2:
cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>(
reinterpret_cast<cutlass::float_e5m2_t *>(pointer_),
reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()),
capacity_,
seed,
dist
@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz
}
void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}

View File

@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse(
return Status::kErrorInvalidProblem;
}
if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
return Status::kErrorInvalidProblem;
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
set_argument(result, "C", problem_space,
std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
set_argument(result, "D", problem_space,
std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
set_argument(result, "m", problem_space, m);
set_argument(result, "n", problem_space, n);
set_argument(result, "k", problem_space, k);
@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace(
problem_.batch_count * gemm_workspace_.problem_count
);
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
// NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels
gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)};
gemm_workspace_.arguments.batch_count = problem_.batch_count;

View File

@ -270,17 +270,17 @@ int OperationProfiler::profile_all(
ProblemSpace::Iterator problem_it = problem_space.begin();
ProblemSpace::Iterator problem_end = problem_space.end();
bool continue_profiling = true, internal_error = false;
bool continue_profiling = true;
int retval = 0;
// For each problem in problem space
for (; continue_profiling && problem_it != problem_end; ++problem_it) {
ProblemSpace::Problem problem = problem_it.at();
report.next_problem();
// For each operation in manifest
for (auto const & operation_ptr : manifest) {
int matched_operation_count = 0;
for (auto const& operation_ptr : manifest) {
library::Operation const *operation = operation_ptr.get();
@ -292,8 +292,8 @@ int OperationProfiler::profile_all(
// Execute compatible cutlass operations if they satisfy the current device's compute capability
if (operation->description().kind == kind_ &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
options.device.compute_capability() <= max_cc) {
std::string operation_name(operation->description().name);
@ -320,7 +320,10 @@ int OperationProfiler::profile_all(
if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) {
continue;
}
// We have found a kernel match, so increment the counter for matched kernels
++matched_operation_count;
// A. Initialize configuration
Status status = this->initialize_configuration(
options,
@ -374,7 +377,6 @@ int OperationProfiler::profile_all(
//
// B. Verify CUTLASS
if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
continue_profiling = this->verify_cutlass(
@ -426,10 +428,18 @@ int OperationProfiler::profile_all(
if (!continue_profiling) {
break;
}
}
}
// If we did not find any kernels that match our filters and error_on_no_match was set, report an error
if (options.profiling.error_on_no_match && matched_operation_count <= 0) {
#if !NDEBUG
std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl;
#endif
retval = 1;
}
}
return internal_error ? 1 : 0;
return retval;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline):
}
else if (cmdline.check_cmd_line_flag("kernels")) {
cmdline.get_cmd_line_arguments("kernels", operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
if (cmdline.check_cmd_line_flag("ignore-kernels")) {
cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
// Prevent launches on the device for anything other than CUTLASS operation

View File

@ -196,6 +196,9 @@ public:
/// If true, profiling is actually conducted.
bool enabled;
/// If true, profiling returns an error code if no kernels are found to match the filters.
bool error_on_no_match = false;
/// List of providers of each functionality to be profiled
ProviderVector providers;