Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161)
Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com>
@@ -382,7 +382,21 @@ endif()
 if (CUTLASS_ENABLE_GDC_FOR_SM90)
   message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).")
   list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
+  list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
 endif()
 
+if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT)
+  set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON)
+endif()
+
+set(CUTLASS_ENABLE_GDC_FOR_SM100
+    ${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT}
+    CACHE BOOL
+    "Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).")
+
+if (CUTLASS_ENABLE_GDC_FOR_SM100)
+  message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).")
+  list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1)
+endif()
 
 set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!")
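For context on what these flags ultimately enable: programmatic dependent launch (PDL) lets a dependent kernel begin launching before its predecessor has fully retired, with the ordering enforced on-device by grid dependency control. The sketch below is not CUTLASS code; it is a minimal standalone CUDA illustration of the runtime launch attribute and device intrinsics involved, with invented kernel names and sizes.

    #include <cuda_runtime.h>

    __global__ void producer(float* out) {
      out[threadIdx.x] = static_cast<float>(threadIdx.x);
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
      // Signal that grids depending on this one may begin launching.
      cudaTriggerProgrammaticLaunchCompletion();
    #endif
    }

    __global__ void consumer(const float* in, float* out) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
      // Prologue work that does not read `in` could overlap the producer;
      // then wait until the producer grid's writes are visible.
      cudaGridDependencySynchronize();
    #endif
      out[threadIdx.x] = in[threadIdx.x] * 2.0f;
    }

    int main() {
      float *a = nullptr, *b = nullptr;
      cudaMalloc(&a, 128 * sizeof(float));
      cudaMalloc(&b, 128 * sizeof(float));

      producer<<<1, 128>>>(a);

      // Opt the dependent kernel in to PDL via a launch attribute.
      cudaLaunchAttribute attr{};
      attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
      attr.val.programmaticStreamSerializationAllowed = 1;

      cudaLaunchConfig_t cfg{};
      cfg.gridDim = 1;
      cfg.blockDim = 128;
      cfg.attrs = &attr;
      cfg.numAttrs = 1;

      cudaLaunchKernelEx(&cfg, consumer, static_cast<const float*>(a), b);
      cudaDeviceSynchronize();
      cudaFree(a);
      cudaFree(b);
      return 0;
    }

Built with nvcc -arch=sm_90, the two grids may overlap; on targets without PDL support the intrinsics compile out and the sequence degrades to an ordinary stream-ordered launch.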
@@ -46,6 +46,11 @@
      defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
 #define CUTLASS_GDC_ENABLED
 #endif
+#if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \
+     __CUDACC_VER_MAJOR__ >= 12 && \
+     defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL))
+#define CUTLASS_GDC_ENABLED
+#endif
 #endif
 
 namespace cutlass {
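Downstream device code can then test CUTLASS_GDC_ENABLED instead of repeating the compiler-version and architecture checks. A minimal sketch of that guard pattern follows; the helper names are hypothetical illustrations, not CUTLASS's API.

    #include <cuda_runtime.h>

    // Hypothetical helpers: the GDC intrinsics are compiled only when
    // CUTLASS_GDC_ENABLED was defined by the block above, so the same
    // kernel source still builds for targets without GDC support.
    __device__ __forceinline__ void wait_on_dependent_grids_sketch() {
    #if defined(CUTLASS_GDC_ENABLED)
      cudaGridDependencySynchronize();  // wait on the predecessor grid
    #endif
    }

    __device__ __forceinline__ void launch_dependent_grids_sketch() {
    #if defined(CUTLASS_GDC_ENABLED)
      cudaTriggerProgrammaticLaunchCompletion();  // let successors launch
    #endif
    }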
@@ -281,13 +281,15 @@ struct CollectiveMma<
   static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeA = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>;
   using SmemScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeA>, ElementAccumulator>;
+  static constexpr int ElementsPerSFACopy = static_cast<int>(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator));
 
   static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleNsPerTile : ScaleKsPerTile;
   using ScaleCopyTypeB = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>;
   using SmemScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeB>, ElementAccumulator>;
+  static constexpr int ElementsPerSFBCopy = static_cast<int>(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator));
 
-  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFA>>>{}));
-  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFB>>>{}));
+  using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFACopy>>>{}));
+  using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFBCopy>>>{}));
 
   struct SharedStorage {
     struct TensorStorage : cute::aligned_struct<128, _0> {
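The two-line change at the bottom of this hunk is the kernel fix: the value layout of the scale-factor tiled copy must describe how many elements one vectorized copy actually moves (ElementsPerSFACopy / ElementsPerSFBCopy), not how many scales the tile has along its leading dimension. The two quantities only diverge once the copy type is clamped at 16 bytes, the widest cp.async transfer. A standalone constexpr check of that arithmetic, with assumed values rather than anything taken from the PR:

    #include <algorithm>

    // Assumed configuration (illustrative only): 4-byte float accumulators
    // and 8 leading scales per tile.
    constexpr int kAccumBytes = 4;
    constexpr int kLeadingScalesPerTile = 8;

    // Mirrors ScaleCopyTypeA: copy width in bytes, clamped to 16.
    constexpr int kCopyBytes = std::min(kAccumBytes * kLeadingScalesPerTile, 16);

    // Mirrors ElementsPerSFACopy: elements moved per vectorized copy.
    constexpr int kElementsPerCopy = kCopyBytes / kAccumBytes;

    static_assert(kCopyBytes == 16, "clamp engages at 8 scales x 4 bytes");
    static_assert(kElementsPerCopy == 4, "4 elements per copy, not 8");

    int main() { return 0; }

With four or fewer leading scales the clamp never engages and the old and new value layouts coincide, which suggests why the mismatch only mattered for larger per-tile scale counts.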