Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161)

Author: dePaul Miller
Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com>
Date: 2025-03-10 11:36:11 -07:00 (committed by GitHub)
Parent: df18f5e4f5
Commit: 06e560d98a
3 changed files with 24 additions and 3 deletions

Changed file 1 of 3 — CMake build configuration:

@@ -382,7 +382,21 @@ endif()
 if (CUTLASS_ENABLE_GDC_FOR_SM90)
   message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).")
   list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
   list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
 endif()
+if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT)
+  set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON)
+endif()
+set(CUTLASS_ENABLE_GDC_FOR_SM100
+  ${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT}
+  CACHE BOOL
+  "Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).")
+if (CUTLASS_ENABLE_GDC_FOR_SM100)
+  message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).")
+  list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1)
+endif()
 set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!")
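This mirrors the existing SM90 option: the new cache option defaults to ON (overridable either by preseeding CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT or by configuring with -DCUTLASS_ENABLE_GDC_FOR_SM100=OFF), and when enabled every CUDA translation unit is compiled with CUTLASS_ENABLE_GDC_FOR_SM100 defined, which the feature-detection guard in the next file keys on.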

Changed file 2 of 3 — GDC feature-detection header:

@@ -46,6 +46,11 @@
     defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
 #define CUTLASS_GDC_ENABLED
 #endif
+#if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \
+     __CUDACC_VER_MAJOR__ >= 12 && \
+     defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL))
+#define CUTLASS_GDC_ENABLED
+#endif
 #endif

 namespace cutlass {
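Once CUTLASS_GDC_ENABLED is defined, device code may safely emit grid-dependency-control instructions for programmatic dependent launch (PDL). Below is a minimal sketch of the producer/consumer pattern this unlocks, written against the raw CUDA PDL device intrinsics rather than any CUTLASS wrapper; the two kernels are illustrative, not library code.

#include <cuda_runtime.h>

// Producer: signals that dependent grids may start their prologue early.
__global__ void producer(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = static_cast<float>(i);
  }
#if defined(CUTLASS_GDC_ENABLED)
  // Valid only on arch-accelerated targets (sm_90a / sm_100a), which is
  // exactly what the guard above establishes.
  cudaTriggerProgrammaticLaunchCompletion();
#endif
}

// Consumer: must not read the producer's output before its writes are visible.
__global__ void consumer(float const* in, float* out, int n) {
#if defined(CUTLASS_GDC_ENABLED)
  // Blocks until the grid this one depends on has made its memory visible.
  cudaGridDependencySynchronize();
#endif
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = 2.0f * in[i];
  }
}

Overlap only occurs if the consumer is launched with the programmatic-stream-serialization launch attribute (e.g. through cudaLaunchKernelEx); without it, the two grids serialize on the stream as usual and both intrinsics are effectively no-ops.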

Changed file 3 of 3 — blockwise/groupwise scaling collective mainloop:

@@ -281,13 +281,15 @@ struct CollectiveMma<
   static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeA = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>;
   using SmemScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeA>, ElementAccumulator>;
+  static constexpr int ElementsPerSFACopy = static_cast<int>(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator));
   static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleNsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeB = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>;
   using SmemScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeB>, ElementAccumulator>;
+  static constexpr int ElementsPerSFBCopy = static_cast<int>(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator));
-  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFA>>>{}));
-  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFB>>>{}));
+  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFACopy>>>{}));
+  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFBCopy>>>{}));
   struct SharedStorage {
     struct TensorStorage : cute::aligned_struct<128, _0> {
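The substantive fix in this file: the value layout handed to make_tiled_copy previously spanned the entire per-tile leading scale count, but one SM80_CP_ASYNC atom moves at most 16 bytes, so whenever sizeof(ElementAccumulator) * LeadingScalesPerTileSFA exceeded 16 the value layout no longer described a single atom. The new ElementsPerSFACopy / ElementsPerSFBCopy constants size the value layout to exactly the number of accumulator elements one atom transfers. A standalone sketch of that arithmetic, assuming a 4-byte ElementAccumulator (the names mirror the diff, but the code is illustrative, not the library's):

#include <algorithm>
#include <cstdio>

// ScaleCopyTypeA analogue: copy width in bytes, saturated at cp.async's 16 B maximum.
constexpr int copy_bytes(int leading_scales, int elem_bytes) {
  return std::min(elem_bytes * leading_scales, 16);
}

// ElementsPerSFACopy analogue: accumulator elements moved by one copy atom.
constexpr int elements_per_copy(int leading_scales, int elem_bytes) {
  return copy_bytes(leading_scales, elem_bytes) / elem_bytes;
}

int main() {
  // 128 leading scales per tile: the old value layout asked for 128 elements
  // per thread, yet a single atom moves only 16 B = 4 floats.
  static_assert(elements_per_copy(128, 4) == 4, "one 16 B atom = 4 floats");
  // Below the 16 B cap the two quantities coincide, so the old and new
  // definitions agree there.
  static_assert(elements_per_copy(2, 4) == 2, "8 B copy = 2 floats");
  std::printf("scale-copy vector widths check out\n");
  return 0;
}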