Fix SM90 beta=1 hang and stream-K launch errors (#2172)

* Fix stream-K occupancy calculation

* Fix beta=1 hang
Author:    Jack Kosaian
Date:      2025-03-13 13:07:37 -05:00
Committer: GitHub
Parent:    06e560d98a
Commit:    6c6b78550e

4 changed files with 93 additions and 64 deletions
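
The first three changed files carry the kernel-side fix for the beta=1 hang: each hunk moves or defers load_order_barrier.wait() so the wait can no longer be reached on a path where the matching arrival never happens. The fourth file carries the stream-K launch fix: it renames bypass_occupancy_calculation to bypass_sm90_occupancy_calculation, threads the flag through the SM90 and SM100 stream-K scheduler parameters, and clamps the occupancy-derived launch grid to the problem's actual block count.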

==== Changed file 1 of 4 ====

@@ -661,9 +661,9 @@ public:
         // Converge before issuing tensormap fence release since fence is aligned
         __syncwarp();
         collective_epilogue.template tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
-      }
-      load_order_barrier.wait();
+        load_order_barrier.wait();
+      }

       while (work_tile_info.is_valid()) {
         int32_t curr_batch = work_tile_info.L_idx;
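
This hunk moves load_order_barrier.wait() from after the guarded block to inside it, so the wait executes only on the path where the matching arrival can occur. A minimal host-side sketch of this hang class follows (not CUTLASS code: std::binary_semaphore stands in for the kernel barrier, and has_work is a hypothetical stand-in for the condition that depends on the epilogue configuration, e.g. whether a source load is needed when beta=1):

```cpp
#include <semaphore>
#include <thread>

std::binary_semaphore load_order{0};  // stand-in for load_order_barrier

void producer(bool has_work) {
  if (has_work) {
    // ... produce, e.g. load the epilogue source tensor ...
    load_order.release();  // the matching "arrive" happens only on this path
  }
}

void consumer(bool has_work) {
  // Buggy shape: an unconditional load_order.acquire() placed here deadlocks
  // whenever has_work == false, because nobody ever releases the semaphore.
  if (has_work) {
    load_order.acquire();  // fixed shape: wait on the same guarded path
  }
  // ... continue with the persistent work loop ...
}

int main() {
  std::thread p(producer, /*has_work=*/false);
  std::thread c(consumer, /*has_work=*/false);
  p.join();
  c.join();  // completes; with the buggy shape this join would never return
}
```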

==== Changed file 2 of 4 ====

@@ -693,10 +693,10 @@ public:
         // Converge before issuing tensormap fence release since fence is aligned
         __syncwarp();
         collective_epilogue.template tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
+        load_order_barrier.wait();
       }
-      load_order_barrier.wait();

       while (work_tile_info.is_valid()) {
         int32_t curr_batch = work_tile_info.L_idx;

==== Changed file 3 of 4 ====

@@ -548,8 +548,13 @@ public:
     // unflushed global memory prior to this instruction
     cutlass::arch::wait_on_dependent_grids();
-    load_order_barrier.wait();
+    bool do_load_order_wait = true;

     while (work_tile_info.is_valid()) {
+      if (do_load_order_wait) {
+        load_order_barrier.wait();
+        do_load_order_wait = false;
+      }
+
       // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
       auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
       auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
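
Here the up-front wait is replaced with a deferred one: the barrier is waited on at most once, and only if the persistent loop actually receives a valid work tile. A self-contained sketch of the pattern, with hypothetical stand-ins for the kernel's scheduler query, barrier, and tile processing:

```cpp
#include <cstdio>

static int tiles_left = 3;
bool work_tile_is_valid() { return tiles_left > 0; }    // scheduler query stand-in
void load_order_wait()    { std::puts("waited once"); } // barrier stand-in
void process_tile()       { --tiles_left; }

int main() {
  bool do_load_order_wait = true;   // same flag as in the hunk above
  while (work_tile_is_valid()) {
    if (do_load_order_wait) {
      load_order_wait();            // runs only on the first real iteration
      do_load_order_wait = false;
    }
    process_tile();
  }
  // With zero valid tiles the wait never executes, so the kernel cannot
  // block on a barrier that will never be signaled.
}
```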

==== Changed file 4 of 4 ====

@@ -188,8 +188,8 @@ struct PersistentTileSchedulerSm90Params {
     KernelHardwareInfo hw_info,
     int max_swizzle_size,
     RasterOrderOptions raster_order_option,
-    bool truncate_by_problem_size=true
-    , bool bypass_occupancy_calculation=false
+    bool truncate_by_problem_size=true,
+    bool bypass_sm90_occupancy_calculation=false
   ) {
     dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
@@ -199,8 +199,8 @@ struct PersistentTileSchedulerSm90Params {
       hw_info,
       max_swizzle_size,
       raster_order_option,
-      truncate_by_problem_size
-      , bypass_occupancy_calculation
+      truncate_by_problem_size,
+      bypass_sm90_occupancy_calculation
     );
   }
@@ -215,8 +215,8 @@ struct PersistentTileSchedulerSm90Params {
     KernelHardwareInfo hw_info,
     int max_swizzle_size,
     RasterOrderOptions raster_order_option,
-    bool truncate_by_problem_size=true
-    , bool bypass_occupancy_calculation=false
+    bool truncate_by_problem_size=true,
+    bool bypass_sm90_occupancy_calculation=false
   ) {
     int const sm_count = hw_info.sm_count;
@@ -267,24 +267,28 @@ struct PersistentTileSchedulerSm90Params {
     // already calculated using cudaOccupancyMaxActiveClusters
     else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) {
       if (raster_order == RasterOrder::AlongN) {
-        launch_grid.y = max_active_clusters * cluster_shape.n();
+        launch_grid.y = possibly_truncate(
+                          max_active_clusters * cluster_shape.n(),
+                          problem_blocks_total / cluster_shape.m());
       }
       else {
-        launch_grid.x = max_active_clusters * cluster_shape.m();
+        launch_grid.x = possibly_truncate(
+                          max_active_clusters * cluster_shape.m(),
+                          problem_blocks_total / cluster_shape.n());
       }
       CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = "
                          "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
     }
     else {
       int cta_per_device = sm_count;
-      if (!bypass_occupancy_calculation) {
-      /*
-      * Optimal grid size calculation is based on
-      * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
-      * Hence, maximum SMs per GPC = 18
-      */
-      constexpr int max_sm_per_gpc = 18;
-      cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count);
+      if (!bypass_sm90_occupancy_calculation) {
+        /*
+        * Optimal grid size calculation is based on
+        * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
+        * Hence, maximum SMs per GPC = 18
+        */
+        constexpr int max_sm_per_gpc = 18;
+        cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count);
       }

       if (raster_order == RasterOrder::AlongN) {
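
Two changes land in this hunk. First, the grid extent proposed from cudaOccupancyMaxActiveClusters is now clamped through possibly_truncate so it cannot exceed the number of cluster-granular blocks the problem actually has. Second, the GH100 GPC heuristic becomes skippable via bypass_sm90_occupancy_calculation. A sketch of the clamping arithmetic, under the assumption that possibly_truncate(x, max) behaves as min(x, max) here (a hypothetical model, not the helper's actual CUTLASS definition):

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical model of the helper used in the hunk above.
uint64_t possibly_truncate(uint64_t x, uint64_t max) { return std::min(x, max); }

int main() {
  // Assumed numbers: 2x1 clusters, AlongN rasterization, a small problem.
  uint64_t max_active_clusters  = 64;  // reported by cudaOccupancyMaxActiveClusters
  uint64_t cluster_m = 2, cluster_n = 1;
  uint64_t problem_blocks_total = 48;  // fewer blocks than the proposed grid

  // Unclamped: launch_grid.y = 64 * 1 = 64.
  // Clamped:   min(64 * 1, 48 / 2) = 24.
  uint64_t grid_y = possibly_truncate(max_active_clusters * cluster_n,
                                      problem_blocks_total / cluster_m);
  return grid_y == 24 ? 0 : 1;
}
```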
@@ -573,8 +577,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
     ReductionMode reduction_mode,
     DecompositionMode decomposition_mode,
     void* workspace,
-    const uint32_t epilogue_subtile = 1u
-    , uint32_t ktile_start_alignment_count = 1u
+    const uint32_t epilogue_subtile = 1u,
+    uint32_t ktile_start_alignment_count = 1u,
+    bool bypass_sm90_occupancy_calculation=false
   ) {
     dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(
       problem_shape, tile_shape, cluster_shape);
@@ -593,8 +598,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
       reduction_mode,
       decomposition_mode,
       workspace,
-      epilogue_subtile
-      , ktile_start_alignment_count
+      epilogue_subtile,
+      ktile_start_alignment_count,
+      bypass_sm90_occupancy_calculation
     );
   }
@@ -613,8 +619,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
     ReductionMode reduction_mode,
     DecompositionMode decomposition_mode,
     void* workspace,
-    const uint32_t epilogue_subtile = 1
-    , uint32_t ktile_start_alignment_count = 1u
+    const uint32_t epilogue_subtile = 1,
+    uint32_t ktile_start_alignment_count = 1u,
+    bool bypass_sm90_occupancy_calculation=false
   ) {

 #if !defined(__CUDACC_RTC__)
@@ -650,7 +657,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
       raster_order_option,
       decomposition_mode,
       reduction_mode,
-      epilogue_subtile
+      epilogue_subtile,
+      ktile_start_alignment_count,
+      bypass_sm90_occupancy_calculation
     );
   }
@@ -741,19 +750,19 @@ struct PersistentTileSchedulerSm90StreamKParams {
   // Stream-K kernel use below function to set stream-K feature related parameters to choose
   // optimal/customized decomposition mode.
   void stream_k_heuristic(
-      UnderlyingParams underlying_params,
-      dim3 problem_blocks,
-      uint32_t k_tiles_per_output_tile,
-      GemmCoord cluster_shape,
-      KernelHardwareInfo hw_info,
-      int splits,
-      int max_swizzle,
-      RasterOrderOptions raster_order_option,
-      DecompositionMode decomposition_mode,
-      ReductionMode reduction_mode,
-      const uint32_t epilogue_subtile = 1
-      , uint32_t ktile_start_alignment_count = 1u
-  ) {
+    UnderlyingParams underlying_params,
+    dim3 problem_blocks,
+    uint32_t k_tiles_per_output_tile,
+    GemmCoord cluster_shape,
+    KernelHardwareInfo hw_info,
+    int splits,
+    int max_swizzle,
+    RasterOrderOptions raster_order_option,
+    DecompositionMode decomposition_mode,
+    ReductionMode reduction_mode,
+    const uint32_t epilogue_subtile = 1,
+    uint32_t ktile_start_alignment_count = 1u,
+    bool bypass_sm90_occupancy_calculation=false) {
     uint32_t groups = 0;
     uint32_t sk_tiles = 0;
     uint64_t sk_units = 0;
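
Aside from threading bypass_sm90_occupancy_calculation through, this hunk only normalizes the signature: the parameter list is re-indented and the leading-comma style is rewritten with trailing commas, with no change to the existing parameters.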
@@ -785,8 +794,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
       raster_order_option,
       decomposition_mode,
       reduction_mode,
-      epilogue_subtile
-      , ktile_start_alignment_count
+      epilogue_subtile,
+      ktile_start_alignment_count,
+      bypass_sm90_occupancy_calculation
     );

     // Given heuristic_mode returned from the heuristic() method, set params fields.
@@ -809,8 +819,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
         cluster_shape,
         splits,
         epilogue_subtile,
-        reduction_mode
-        , ktile_start_alignment_count
+        reduction_mode,
+        ktile_start_alignment_count
       );
     }
@@ -835,8 +845,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
     RasterOrderOptions raster_order_option,
     DecompositionMode decomposition_mode,
     ReductionMode reduction_mode,
-    uint32_t epilogue_subtile
-    , uint32_t ktile_start_alignment_count
+    uint32_t epilogue_subtile,
+    uint32_t ktile_start_alignment_count,
+    bool bypass_sm90_occupancy_calculation=false
   ) {

     // Get block numbers in m, n and l dimensions
@@ -863,7 +874,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       cluster_shape,
       hw_info,
       max_swizzle,
-      raster_order_option
+      raster_order_option,
+      bypass_sm90_occupancy_calculation
     );
     uint64_t ctas_per_wave = grid.x * grid.y;
     cluster_size = cluster_shape.m() * cluster_shape.n();
@@ -875,8 +887,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       ctas_per_wave,
       cluster_size,
       k_tiles_per_output_tile,
-      decomposition_mode
-      , ctas_per_wave_in_full_clusters
+      decomposition_mode,
+      ctas_per_wave_in_full_clusters
     );
     uint64_t dp_tiles = output_tiles - sk_tiles;

     // Calculate the number of work units covering the data-parallel and stream-K tiles.
@@ -1035,7 +1047,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     GemmCoord cluster_shape,
     KernelHardwareInfo hw_info,
     int max_swizzle_size,
-    RasterOrderOptions raster_order_option
+    RasterOrderOptions raster_order_option,
+    bool bypass_sm90_occupancy_calculation=false
   ) {
     dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
@@ -1045,7 +1058,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       cluster_shape,
       hw_info,
       max_swizzle_size,
-      raster_order_option
+      raster_order_option,
+      bypass_sm90_occupancy_calculation
     );
   }
@@ -1059,7 +1073,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     GemmCoord cluster_shape,
     KernelHardwareInfo hw_info,
     int max_swizzle_size,
-    RasterOrderOptions raster_order_option
+    RasterOrderOptions raster_order_option,
+    bool bypass_sm90_occupancy_calculation=false
   ) {

     // Call into the underlying get_grid_shape method, but do not allow the grid shape returned
@@ -1070,8 +1085,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       hw_info,
       max_swizzle_size,
       raster_order_option,
-      /* truncate_by_problem_size = */false
-      /* bypass_occupancy_calculation = */, true
+      /* truncate_by_problem_size = */false,
+      bypass_sm90_occupancy_calculation
     );
   }
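
This hunk is a behavioral fix, not just comma cleanup: the call previously passed a literal true for the bypass argument, so this stream-K grid query always skipped the SM90 occupancy calculation. It now forwards the caller's bypass_sm90_occupancy_calculation, which defaults to false for SM90 and is set to true by the SM100 scheduler below.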
@@ -1186,7 +1201,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     uint32_t accumulator_bits,
     uint32_t epilogue_subtile = 1,
     uint32_t num_accumulator_mtxs = 1,
-    uint32_t ktile_start_alignment_count = 1) {
+    uint32_t ktile_start_alignment_count = 1,
+    bool bypass_sm90_occupancy_calculation=false) {

     auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
     problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
@@ -1216,7 +1232,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       cluster_shape,
       new_hw_info,
       max_swizzle,
-      raster_order_option
+      raster_order_option,
+      bypass_sm90_occupancy_calculation
     );
     uint64_t ctas_per_wave = grid.x * grid.y;
     uint64_t cluster_size = cluster_shape.m() * cluster_shape.n();
@@ -1347,7 +1364,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     uint32_t element_accumulator_bits,
     uint32_t epilogue_subtile = 1,
     uint32_t num_accumulator_mtxs = 1,
-    uint32_t ktile_start_alignment_count = 1) {
+    uint32_t ktile_start_alignment_count = 1,
+    bool bypass_sm90_occupancy_calculation=false) {

     size_t barrier_workspace_size = 0;
     size_t reduction_workspace_size = 0;
@@ -1371,7 +1389,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       element_accumulator_bits,
       epilogue_subtile,
       num_accumulator_mtxs,
-      ktile_start_alignment_count
+      ktile_start_alignment_count,
+      bypass_sm90_occupancy_calculation
     );
 #endif
@@ -1449,7 +1468,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     uint32_t epilogue_subtile = 1,
     uint32_t num_accumulator_mtxs = 1,
     CudaHostAdapter* cuda_adapter = nullptr,
-    uint32_t ktile_start_alignment_count = 1) {
+    uint32_t ktile_start_alignment_count = 1,
+    bool bypass_sm90_occupancy_calculation=false) {

 #if !defined(__CUDACC_RTC__)
     uint64_t barrier_workspace_size = 0;
@@ -1473,7 +1493,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       element_accumulator_bits,
       epilogue_subtile,
       num_accumulator_mtxs,
-      ktile_start_alignment_count
+      ktile_start_alignment_count,
+      bypass_sm90_occupancy_calculation
     );

     if (barrier_workspace_size > 0) {
@@ -2165,7 +2186,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
       decomposition_mode,
       workspace,
       /*epilogue_subtile=*/1,
-      ktile_start_alignment_count
+      ktile_start_alignment_count,
+      /*bypass_sm90_occupancy_calculation=*/true
     );

     log_swizzle_size_ = sk_params_.log_swizzle_size_;
@@ -2334,7 +2356,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
       element_accumulator_bits,
       epilogue_subtile,
       num_accumulator_mtxs,
-      ktile_start_alignment_count
+      ktile_start_alignment_count,
+      /*bypass_sm90_occupancy_calculation=*/true
     );
   }
@@ -2431,7 +2454,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
       epilogue_subtile,
       num_accumulator_mtxs,
       cuda_adapter,
-      ktile_start_alignment_count
+      ktile_start_alignment_count,
+      /*bypass_sm90_occupancy_calculation=*/true
     );
   }
 };
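
All three SM100 call sites pass /*bypass_sm90_occupancy_calculation=*/true: PersistentTileSchedulerSm100StreamKParams reuses the SM90 stream-K parameter plumbing, and the hard-coded true keeps the GH100-specific GPC occupancy heuristic, which the renamed flag marks as SM90-only, out of the SM100 grid calculation.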