Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161)

Author: dePaul Miller
Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com>
Date: 2025-03-10 11:36:11 -07:00 (committed by GitHub)
Parent: df18f5e4f5
Commit: 06e560d98a
3 changed files with 24 additions and 3 deletions

Changed file 1 of 3 — CMake build configuration:

@@ -382,7 +382,21 @@ endif()
 if (CUTLASS_ENABLE_GDC_FOR_SM90)
   message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).")
   list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
   list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
 endif()
+if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT)
+  set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON)
+endif()
+set(CUTLASS_ENABLE_GDC_FOR_SM100
+  ${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT}
+  CACHE BOOL
+  "Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).")
+if (CUTLASS_ENABLE_GDC_FOR_SM100)
+  message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).")
+  list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1)
+endif()
 set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!")
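This mirrors the existing SM90 option: the new cache option defaults to ON (overridable either by preseeding CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT or by configuring with -DCUTLASS_ENABLE_GDC_FOR_SM100=OFF), and when enabled every CUDA translation unit is compiled with CUTLASS_ENABLE_GDC_FOR_SM100 defined, which the feature-detection guard in the next file keys on.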

Changed file 2 of 3 — GDC feature-detection header:

@@ -46,6 +46,11 @@
     defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
 #define CUTLASS_GDC_ENABLED
 #endif
+#if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \
+     __CUDACC_VER_MAJOR__ >= 12 && \
+     defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL))
+#define CUTLASS_GDC_ENABLED
+#endif
 #endif

 namespace cutlass {
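Once CUTLASS_GDC_ENABLED is defined, device code may safely emit grid-dependency-control instructions for programmatic dependent launch (PDL). Below is a minimal sketch of the producer/consumer pattern this unlocks, written against the raw CUDA PDL device intrinsics rather than any CUTLASS wrapper; the two kernels are illustrative, not library code.

#include <cuda_runtime.h>

// Producer: signals that dependent grids may start their prologue early.
__global__ void producer(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = static_cast<float>(i);
  }
#if defined(CUTLASS_GDC_ENABLED)
  // Valid only on arch-accelerated targets (sm_90a / sm_100a), which is
  // exactly what the guard above establishes.
  cudaTriggerProgrammaticLaunchCompletion();
#endif
}

// Consumer: must not read the producer's output before its writes are visible.
__global__ void consumer(float const* in, float* out, int n) {
#if defined(CUTLASS_GDC_ENABLED)
  // Blocks until the grid this one depends on has made its memory visible.
  cudaGridDependencySynchronize();
#endif
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.0f * in[i];
  }
}

Overlap only occurs if the consumer is launched with the programmatic-stream-serialization launch attribute (e.g. through cudaLaunchKernelEx); without it, the two grids serialize on the stream as usual and both intrinsics are effectively no-ops.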

Changed file 3 of 3 — blockwise/groupwise scaling collective mainloop:

@@ -281,13 +281,15 @@ struct CollectiveMma<
   static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeA = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>;
   using SmemScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeA>, ElementAccumulator>;
+  static constexpr int ElementsPerSFACopy = static_cast<int>(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator));
   static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleNsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeB = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>;
   using SmemScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeB>, ElementAccumulator>;
+  static constexpr int ElementsPerSFBCopy = static_cast<int>(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator));
-  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFA>>>{}));
-  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFB>>>{}));
+  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFACopy>>>{}));
+  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFBCopy>>>{}));
   struct SharedStorage {
     struct TensorStorage : cute::aligned_struct<128, _0> {
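The substantive fix in this file: the value layout handed to make_tiled_copy previously spanned the entire per-tile leading scale count, but one SM80_CP_ASYNC atom moves at most 16 bytes, so whenever sizeof(ElementAccumulator) * LeadingScalesPerTileSFA exceeded 16 the value layout no longer described a single atom. The new ElementsPerSFACopy / ElementsPerSFBCopy constants size the value layout to exactly the number of accumulator elements one atom transfers. A standalone sketch of that arithmetic, assuming a 4-byte ElementAccumulator (the names mirror the diff, but the code is illustrative, not the library's):

#include <algorithm>
#include <cstdio>

// ScaleCopyTypeA analogue: copy width in bytes, saturated at cp.async's 16 B maximum.
constexpr int copy_bytes(int leading_scales, int elem_bytes) {
  return std::min(elem_bytes * leading_scales, 16);
}

// ElementsPerSFACopy analogue: accumulator elements moved by one copy atom.
constexpr int elements_per_copy(int leading_scales, int elem_bytes) {
  return copy_bytes(leading_scales, elem_bytes) / elem_bytes;
}

int main() {
  // 128 leading scales per tile: the old value layout asked for 128 elements
  // per thread, yet a single atom moves only 16 B = 4 floats.
  static_assert(elements_per_copy(128, 4) == 4, "one 16 B atom = 4 floats");
  // Below the 16 B cap the two quantities coincide, so the old and new
  // definitions agree there.
  static_assert(elements_per_copy(2, 4) == 2, "8 B copy = 2 floats");
  std::printf("scale-copy vector widths check out\n");
  return 0;
}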