From 6c6b78550e0752309c1f85dc9c1f6e636c8d751c Mon Sep 17 00:00:00 2001 From: Jack Kosaian Date: Thu, 13 Mar 2025 13:07:37 -0500 Subject: [PATCH] Fix SM90 beta=1 hang and stream-K launch errors (#2172) * Fix stream-K occupancy calculation * Fix beta=1 hang --- ..._array_tma_warpspecialized_cooperative.hpp | 4 +- ...emm_array_tma_warpspecialized_pingpong.hpp | 4 +- ...sm90_gemm_tma_warpspecialized_pingpong.hpp | 7 +- .../gemm/kernel/tile_scheduler_params.h | 142 ++++++++++-------- 4 files changed, 93 insertions(+), 64 deletions(-) diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp index 20523304..73ee5055 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp @@ -661,9 +661,9 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0); - } - load_order_barrier.wait(); + load_order_barrier.wait(); + } while (work_tile_info.is_valid()) { int32_t curr_batch = work_tile_info.L_idx; diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp index 3515f4d2..882974ee 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp @@ -693,10 +693,10 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0); + + load_order_barrier.wait(); } - load_order_barrier.wait(); - while (work_tile_info.is_valid()) { int32_t curr_batch = work_tile_info.L_idx; diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp index cba20b5c..9c5d9d47 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp @@ -548,8 +548,13 @@ public: // unflushed global memory prior to this instruction cutlass::arch::wait_on_dependent_grids(); - load_order_barrier.wait(); + bool do_load_order_wait = true; while (work_tile_info.is_valid()) { + if (do_load_order_wait) { + load_order_barrier.wait(); + do_load_order_wait = false; + } + // Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl)); auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl)); diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h index e1579d3f..7ee5b3f8 100644 --- a/include/cutlass/gemm/kernel/tile_scheduler_params.h +++ b/include/cutlass/gemm/kernel/tile_scheduler_params.h @@ -188,8 +188,8 @@ struct PersistentTileSchedulerSm90Params { KernelHardwareInfo hw_info, int max_swizzle_size, RasterOrderOptions raster_order_option, - bool truncate_by_problem_size=true - , bool bypass_occupancy_calculation=false + bool truncate_by_problem_size=true, + bool bypass_sm90_occupancy_calculation=false ) { dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape); @@ -199,8 +199,8 @@ struct PersistentTileSchedulerSm90Params { hw_info, max_swizzle_size, raster_order_option, - truncate_by_problem_size - , bypass_occupancy_calculation + truncate_by_problem_size, + bypass_sm90_occupancy_calculation ); } @@ -215,8 +215,8 @@ struct PersistentTileSchedulerSm90Params { KernelHardwareInfo hw_info, int max_swizzle_size, RasterOrderOptions raster_order_option, - bool truncate_by_problem_size=true - , bool bypass_occupancy_calculation=false + bool truncate_by_problem_size=true, + bool bypass_sm90_occupancy_calculation=false ) { int const sm_count = hw_info.sm_count; @@ -267,24 +267,28 @@ struct PersistentTileSchedulerSm90Params { // already calculated using cudaOccupancyMaxActiveClusters else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) { if (raster_order == RasterOrder::AlongN) { - launch_grid.y = max_active_clusters * cluster_shape.n(); + launch_grid.y = possibly_truncate( + max_active_clusters * cluster_shape.n(), + problem_blocks_total / cluster_shape.m()); } else { - launch_grid.x = max_active_clusters * cluster_shape.m(); + launch_grid.x = possibly_truncate( + max_active_clusters * cluster_shape.m(), + problem_blocks_total / cluster_shape.n()); } CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = " "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n"); } else { int cta_per_device = sm_count; - if (!bypass_occupancy_calculation) { - /* - * Optimal grid size calculation is based on - * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU - * Hence, maximum SMs per GPC = 18 - */ - constexpr int max_sm_per_gpc = 18; - cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count); + if (!bypass_sm90_occupancy_calculation) { + /* + * Optimal grid size calculation is based on + * GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU + * Hence, maximum SMs per GPC = 18 + */ + constexpr int max_sm_per_gpc = 18; + cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count); } if (raster_order == RasterOrder::AlongN) { @@ -573,8 +577,9 @@ struct PersistentTileSchedulerSm90StreamKParams { ReductionMode reduction_mode, DecompositionMode decomposition_mode, void* workspace, - const uint32_t epilogue_subtile = 1u - , uint32_t ktile_start_alignment_count = 1u + const uint32_t epilogue_subtile = 1u, + uint32_t ktile_start_alignment_count = 1u, + bool bypass_sm90_occupancy_calculation=false ) { dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl( problem_shape, tile_shape, cluster_shape); @@ -593,8 +598,9 @@ struct PersistentTileSchedulerSm90StreamKParams { reduction_mode, decomposition_mode, workspace, - epilogue_subtile - , ktile_start_alignment_count + epilogue_subtile, + ktile_start_alignment_count, + bypass_sm90_occupancy_calculation ); } @@ -613,8 +619,9 @@ struct PersistentTileSchedulerSm90StreamKParams { ReductionMode reduction_mode, DecompositionMode decomposition_mode, void* workspace, - const uint32_t epilogue_subtile = 1 - , uint32_t ktile_start_alignment_count = 1u + const uint32_t epilogue_subtile = 1, + uint32_t ktile_start_alignment_count = 1u, + bool bypass_sm90_occupancy_calculation=false ) { #if !defined(__CUDACC_RTC__) @@ -650,7 +657,9 @@ struct PersistentTileSchedulerSm90StreamKParams { raster_order_option, decomposition_mode, reduction_mode, - epilogue_subtile + epilogue_subtile, + ktile_start_alignment_count, + bypass_sm90_occupancy_calculation ); } @@ -741,19 +750,19 @@ struct PersistentTileSchedulerSm90StreamKParams { // Stream-K kernel use below function to set stream-K feature related parameters to choose // optimal/customized decomposition mode. void stream_k_heuristic( - UnderlyingParams underlying_params, - dim3 problem_blocks, - uint32_t k_tiles_per_output_tile, - GemmCoord cluster_shape, - KernelHardwareInfo hw_info, - int splits, - int max_swizzle, - RasterOrderOptions raster_order_option, - DecompositionMode decomposition_mode, - ReductionMode reduction_mode, - const uint32_t epilogue_subtile = 1 - , uint32_t ktile_start_alignment_count = 1u - ) { + UnderlyingParams underlying_params, + dim3 problem_blocks, + uint32_t k_tiles_per_output_tile, + GemmCoord cluster_shape, + KernelHardwareInfo hw_info, + int splits, + int max_swizzle, + RasterOrderOptions raster_order_option, + DecompositionMode decomposition_mode, + ReductionMode reduction_mode, + const uint32_t epilogue_subtile = 1, + uint32_t ktile_start_alignment_count = 1u, + bool bypass_sm90_occupancy_calculation=false) { uint32_t groups = 0; uint32_t sk_tiles = 0; uint64_t sk_units = 0; @@ -785,8 +794,9 @@ struct PersistentTileSchedulerSm90StreamKParams { raster_order_option, decomposition_mode, reduction_mode, - epilogue_subtile - , ktile_start_alignment_count + epilogue_subtile, + ktile_start_alignment_count, + bypass_sm90_occupancy_calculation ); // Given heuristic_mode returned from the heuristic() method, set params fields. @@ -809,8 +819,8 @@ struct PersistentTileSchedulerSm90StreamKParams { cluster_shape, splits, epilogue_subtile, - reduction_mode - , ktile_start_alignment_count + reduction_mode, + ktile_start_alignment_count ); } @@ -835,8 +845,9 @@ struct PersistentTileSchedulerSm90StreamKParams { RasterOrderOptions raster_order_option, DecompositionMode decomposition_mode, ReductionMode reduction_mode, - uint32_t epilogue_subtile - , uint32_t ktile_start_alignment_count + uint32_t epilogue_subtile, + uint32_t ktile_start_alignment_count, + bool bypass_sm90_occupancy_calculation=false ) { // Get block numbers in m, n and l dimensions @@ -863,7 +874,8 @@ struct PersistentTileSchedulerSm90StreamKParams { cluster_shape, hw_info, max_swizzle, - raster_order_option + raster_order_option, + bypass_sm90_occupancy_calculation ); uint64_t ctas_per_wave = grid.x * grid.y; cluster_size = cluster_shape.m() * cluster_shape.n(); @@ -875,8 +887,8 @@ struct PersistentTileSchedulerSm90StreamKParams { ctas_per_wave, cluster_size, k_tiles_per_output_tile, - decomposition_mode - , ctas_per_wave_in_full_clusters + decomposition_mode, + ctas_per_wave_in_full_clusters ); uint64_t dp_tiles = output_tiles - sk_tiles; // Calculate the number of work units covering the data-parallel and stream-K tiles. @@ -1035,7 +1047,8 @@ struct PersistentTileSchedulerSm90StreamKParams { GemmCoord cluster_shape, KernelHardwareInfo hw_info, int max_swizzle_size, - RasterOrderOptions raster_order_option + RasterOrderOptions raster_order_option, + bool bypass_sm90_occupancy_calculation=false ) { dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape); @@ -1045,7 +1058,8 @@ struct PersistentTileSchedulerSm90StreamKParams { cluster_shape, hw_info, max_swizzle_size, - raster_order_option + raster_order_option, + bypass_sm90_occupancy_calculation ); } @@ -1059,7 +1073,8 @@ struct PersistentTileSchedulerSm90StreamKParams { GemmCoord cluster_shape, KernelHardwareInfo hw_info, int max_swizzle_size, - RasterOrderOptions raster_order_option + RasterOrderOptions raster_order_option, + bool bypass_sm90_occupancy_calculation=false ) { // Call into the underlying get_grid_shape method, but do not allow the grid shape returned @@ -1070,8 +1085,8 @@ struct PersistentTileSchedulerSm90StreamKParams { hw_info, max_swizzle_size, raster_order_option, - /* truncate_by_problem_size = */false - /* bypass_occupancy_calculation = */, true + /* truncate_by_problem_size = */false, + bypass_sm90_occupancy_calculation ); } @@ -1186,7 +1201,8 @@ struct PersistentTileSchedulerSm90StreamKParams { uint32_t accumulator_bits, uint32_t epilogue_subtile = 1, uint32_t num_accumulator_mtxs = 1, - uint32_t ktile_start_alignment_count = 1) { + uint32_t ktile_start_alignment_count = 1, + bool bypass_sm90_occupancy_calculation=false) { auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle); problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m()); @@ -1216,7 +1232,8 @@ struct PersistentTileSchedulerSm90StreamKParams { cluster_shape, new_hw_info, max_swizzle, - raster_order_option + raster_order_option, + bypass_sm90_occupancy_calculation ); uint64_t ctas_per_wave = grid.x * grid.y; uint64_t cluster_size = cluster_shape.m() * cluster_shape.n(); @@ -1347,7 +1364,8 @@ struct PersistentTileSchedulerSm90StreamKParams { uint32_t element_accumulator_bits, uint32_t epilogue_subtile = 1, uint32_t num_accumulator_mtxs = 1, - uint32_t ktile_start_alignment_count = 1) { + uint32_t ktile_start_alignment_count = 1, + bool bypass_sm90_occupancy_calculation=false) { size_t barrier_workspace_size = 0; size_t reduction_workspace_size = 0; @@ -1371,7 +1389,8 @@ struct PersistentTileSchedulerSm90StreamKParams { element_accumulator_bits, epilogue_subtile, num_accumulator_mtxs, - ktile_start_alignment_count + ktile_start_alignment_count, + bypass_sm90_occupancy_calculation ); #endif @@ -1449,7 +1468,8 @@ struct PersistentTileSchedulerSm90StreamKParams { uint32_t epilogue_subtile = 1, uint32_t num_accumulator_mtxs = 1, CudaHostAdapter* cuda_adapter = nullptr, - uint32_t ktile_start_alignment_count = 1) { + uint32_t ktile_start_alignment_count = 1, + bool bypass_sm90_occupancy_calculation=false) { #if !defined(__CUDACC_RTC__) uint64_t barrier_workspace_size = 0; @@ -1473,7 +1493,8 @@ struct PersistentTileSchedulerSm90StreamKParams { element_accumulator_bits, epilogue_subtile, num_accumulator_mtxs, - ktile_start_alignment_count + ktile_start_alignment_count, + bypass_sm90_occupancy_calculation ); if (barrier_workspace_size > 0) { @@ -2165,7 +2186,8 @@ struct PersistentTileSchedulerSm100StreamKParams { decomposition_mode, workspace, /*epilogue_subtile=*/1, - ktile_start_alignment_count + ktile_start_alignment_count, + /*bypass_sm90_occupancy_calculation=*/true ); log_swizzle_size_ = sk_params_.log_swizzle_size_; @@ -2334,7 +2356,8 @@ struct PersistentTileSchedulerSm100StreamKParams { element_accumulator_bits, epilogue_subtile, num_accumulator_mtxs, - ktile_start_alignment_count + ktile_start_alignment_count, + /*bypass_sm90_occupancy_calculation=*/true ); } @@ -2431,7 +2454,8 @@ struct PersistentTileSchedulerSm100StreamKParams { epilogue_subtile, num_accumulator_mtxs, cuda_adapter, - ktile_start_alignment_count + ktile_start_alignment_count, + /*bypass_sm90_occupancy_calculation=*/true ); } };