Updates for 3.1 (#932)
This commit is contained in:
@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid
|
||||
# Registers the cutlass_profiler executable tests, one entry per operation kind.
# TEST_COMMAND_OPTIONS lists short suffixes; presumably each suffix is appended to
# TEST_COMMAND_OPTIONS_PREFIX to name the variable holding that test's command line
# (e.g. CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM) -- confirm against the
# definition of cutlass_add_executable_tests.
cutlass_add_executable_tests(
  test_profiler cutlass_profiler
  DEPENDEES test_all
  TEST_COMMAND_OPTIONS
    GEMM
    CONV2D
    CONV3D
    SPGEMM
    RANK_K
    RANK_2K
    TRMM
    SYMM
  TEST_COMMAND_OPTIONS_PREFIX
    CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_
  DISABLE_EXECUTABLE_INSTALL_RULE
  )
|
||||
|
||||
|
||||
@ -124,7 +124,7 @@ int CutlassProfiler::operator()() {
|
||||
options_.execution_mode == ExecutionMode::kTrace) {
|
||||
|
||||
// Profiles all operations
|
||||
profile_();
|
||||
return profile_();
|
||||
}
|
||||
else if (options_.execution_mode == ExecutionMode::kEnumerate) {
|
||||
// Enumerates all operations
|
||||
@ -157,7 +157,7 @@ int CutlassProfiler::profile_() {
|
||||
|
||||
if (result) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const {
|
||||
|
||||
/// Copies from an equivalent-sized tensor in device memory
|
||||
void DeviceAllocation::copy_from_device(void const *ptr) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping copy of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice);
|
||||
if (result != cudaSuccess) {
|
||||
throw std::runtime_error("Failed device-to-device copy");
|
||||
@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) {
|
||||
|
||||
/// Copies from an equivalent-sized tensor in device memory
|
||||
void DeviceAllocation::copy_from_host(void const *ptr) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping copy of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice);
|
||||
if (result != cudaSuccess) {
|
||||
throw std::runtime_error("Failed device-to-device copy");
|
||||
throw std::runtime_error("Failed host-to-device copy");
|
||||
}
|
||||
}
|
||||
|
||||
/// Copies from an equivalent-sized tensor in device memory
|
||||
void DeviceAllocation::copy_to_host(void *ptr) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping copy of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost);
|
||||
if (result != cudaSuccess) {
|
||||
throw std::runtime_error("Failed device-to-device copy");
|
||||
throw std::runtime_error("Failed device-to-host copy");
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
|
||||
if (!good()) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping initialization of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
if (!data()) {
|
||||
throw std::runtime_error("Attempting to initialize invalid allocation.");
|
||||
}
|
||||
|
||||
@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
|
||||
}
|
||||
|
||||
void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
|
||||
if (!good()) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping initialization of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
if (!data()) {
|
||||
throw std::runtime_error("Attempting to initialize invalid allocation.");
|
||||
}
|
||||
|
||||
@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
|
||||
switch (type_) {
|
||||
case library::NumericTypeID::kFE4M3:
|
||||
cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>(
|
||||
reinterpret_cast<cutlass::float_e4m3_t *>(pointer_),
|
||||
reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()),
|
||||
capacity_,
|
||||
seed,
|
||||
dist
|
||||
@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
|
||||
break;
|
||||
case library::NumericTypeID::kFE5M2:
|
||||
cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>(
|
||||
reinterpret_cast<cutlass::float_e5m2_t *>(pointer_),
|
||||
reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()),
|
||||
capacity_,
|
||||
seed,
|
||||
dist
|
||||
@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
|
||||
}
|
||||
|
||||
void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) {
|
||||
if (!good()) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping initialization of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
if (!data()) {
|
||||
throw std::runtime_error("Attempting to initialize invalid allocation.");
|
||||
}
|
||||
|
||||
@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz
|
||||
}
|
||||
|
||||
void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) {
|
||||
if (!good()) {
|
||||
if (!bytes()) {
|
||||
#ifndef NDEBUG
|
||||
std::cout << "Skipping initialization of size 0 allocation\n";
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
if (!data()) {
|
||||
throw std::runtime_error("Attempting to initialize invalid allocation.");
|
||||
}
|
||||
|
||||
|
||||
@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
|
||||
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
|
||||
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
|
||||
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
|
||||
{ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"},
|
||||
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
|
||||
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
|
||||
{ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
|
||||
@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse(
|
||||
return Status::kErrorInvalidProblem;
|
||||
}
|
||||
|
||||
if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
|
||||
return Status::kErrorInvalidProblem;
|
||||
}
|
||||
|
||||
if (!arg_as_scalar(
|
||||
this->alpha,
|
||||
operation_desc.element_epilogue,
|
||||
@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
|
||||
set_argument(result, "C", problem_space,
|
||||
std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
|
||||
|
||||
set_argument(result, "D", problem_space,
|
||||
std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
|
||||
|
||||
set_argument(result, "m", problem_space, m);
|
||||
set_argument(result, "n", problem_space, n);
|
||||
set_argument(result, "k", problem_space, k);
|
||||
@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace(
|
||||
problem_.batch_count * gemm_workspace_.problem_count
|
||||
);
|
||||
|
||||
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
|
||||
|
||||
// NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels
|
||||
gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)};
|
||||
gemm_workspace_.arguments.batch_count = problem_.batch_count;
|
||||
|
||||
@ -270,17 +270,17 @@ int OperationProfiler::profile_all(
|
||||
ProblemSpace::Iterator problem_it = problem_space.begin();
|
||||
ProblemSpace::Iterator problem_end = problem_space.end();
|
||||
|
||||
bool continue_profiling = true, internal_error = false;
|
||||
bool continue_profiling = true;
|
||||
int retval = 0;
|
||||
|
||||
// For each problem in problem space
|
||||
for (; continue_profiling && problem_it != problem_end; ++problem_it) {
|
||||
|
||||
ProblemSpace::Problem problem = problem_it.at();
|
||||
|
||||
report.next_problem();
|
||||
|
||||
// For each operation in manifest
|
||||
for (auto const & operation_ptr : manifest) {
|
||||
int matched_operation_count = 0;
|
||||
for (auto const& operation_ptr : manifest) {
|
||||
|
||||
library::Operation const *operation = operation_ptr.get();
|
||||
|
||||
@ -292,8 +292,8 @@ int OperationProfiler::profile_all(
|
||||
|
||||
// Execute compatible cutlass operations if they satisfy the current device's compute capability
|
||||
if (operation->description().kind == kind_ &&
|
||||
operation->description().provider == library::Provider::kCUTLASS &&
|
||||
options.device.compute_capability() >= min_cc &&
|
||||
operation->description().provider == library::Provider::kCUTLASS &&
|
||||
options.device.compute_capability() >= min_cc &&
|
||||
options.device.compute_capability() <= max_cc) {
|
||||
|
||||
std::string operation_name(operation->description().name);
|
||||
@ -320,7 +320,10 @@ int OperationProfiler::profile_all(
|
||||
if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// we have found a kernel match, so increment the counter for match kernels
|
||||
++matched_operation_count;
|
||||
|
||||
// A. Initialize configuration
|
||||
Status status = this->initialize_configuration(
|
||||
options,
|
||||
@ -374,7 +377,6 @@ int OperationProfiler::profile_all(
|
||||
//
|
||||
|
||||
// B. Verify CUTLASS
|
||||
|
||||
if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
|
||||
|
||||
continue_profiling = this->verify_cutlass(
|
||||
@ -426,10 +428,18 @@ int OperationProfiler::profile_all(
|
||||
if (!continue_profiling) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we did not find any kernels that match our filters and error_on_no_match was set, report an error
|
||||
if (options.profiling.error_on_no_match && matched_operation_count <= 0) {
|
||||
#if !NDEBUG
|
||||
std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl;
|
||||
#endif
|
||||
retval = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return internal_error ? 1 : 0;
|
||||
return retval;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline):
|
||||
}
|
||||
else if (cmdline.check_cmd_line_flag("kernels")) {
|
||||
cmdline.get_cmd_line_arguments("kernels", operation_names);
|
||||
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
|
||||
}
|
||||
|
||||
if (cmdline.check_cmd_line_flag("ignore-kernels")) {
|
||||
cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
|
||||
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
|
||||
}
|
||||
|
||||
// Prevent launches on the device for anything other than CUTLASS operation
|
||||
|
||||
@ -196,6 +196,9 @@ public:
|
||||
/// If true, profiling is actually conducted.
|
||||
bool enabled;
|
||||
|
||||
/// If true, profiling returns an error code if no kernels are found to match the filters.
|
||||
bool error_on_no_match = false;
|
||||
|
||||
/// List of providers of each functionality to be profiled
|
||||
ProviderVector providers;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user