Updates for CUTLASS 3.4.1 (#1346)

* Updates for CUTLASS 3.4.1 * minor epi change
2024-02-15 12:48:34 -08:00
parent 47a3ebbea9
commit bbe579a9e3
49 changed files with 800 additions and 451 deletions
--- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
+++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
@ -70,7 +70,7 @@

 using namespace cute;

-#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)

 /////////////////////////////////////////////////////////////////////////////////////////////////
 /// GEMM kernel configurations
@ -98,8 +98,8 @@ using OperatorClass       = cutlass::arch::OpClassTensorOp;                 // O
 using TileShape           = Shape<_256,_128,_64>;                           // Threadblock-level tile size
 using ClusterShape        = Shape<_1,_2,_1>;                                // Shape of the threadblocks in a cluster
 using StageCountType = cutlass::gemm::collective::StageCountAuto;           // Stage count maximized based on the tile size
-using KernelSchedule = cutlass::gemm::KernelArrayTmaWarpSpecializedCooperative; // Kernel to launch
-using EpilogueSchedule = cutlass::epilogue::NoSmemWarpSpecializedArray;         // Epilogue to launch
+using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative; // Kernel to launch
+using EpilogueSchedule = cutlass::epilogue::PtrArrayNoSmemWarpSpecialized;         // Epilogue to launch

 using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
@ -169,7 +169,7 @@ cutlass::DeviceAllocation<const typename Gemm::ElementC *> ptr_C;
 cutlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput *> ptr_D;
 cutlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput *> ptr_ref_D;

-#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)

 /////////////////////////////////////////////////////////////////////////////////////////////////
 /// Testbed utility types
@ -245,7 +245,7 @@ struct Result
  bool passed = false;
 };

-#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)

 /////////////////////////////////////////////////////////////////////////////////////////////////
 /// GEMM setup and evaluation
@ -468,7 +468,7 @@ int run(Options &options)
  return 0;
 }

-#endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#endif // defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)

 ///////////////////////////////////////////////////////////////////////////////////////////////////

@ -510,7 +510,7 @@ int main(int argc, char const **args) {
  // Evaluate CUTLASS kernels
  //

-#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
+#if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
  run<Gemm>(options);
 #endif

--- a/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
+++ b/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
@ -27,17 +27,17 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-# Note that we set --iterations=0 for all tests below to disable the performance benchmarking.
-# Only the correctness check will be run by these commands.
+set(TEST_SQUARE --m=2048 --n=2048 --k=2048 -l=10 --iterations=1)                   # Square problem sizes
+set(TEST_SQUARE_LARGE_BATCH --m=2048 --n=2048 --k=2048 -l=500 --iterations=1)      # Square problem sizes

-set(TEST_SQUARE --m=2048 --n=2048 --k=2048 -l=10 --iterations=0)                   # Square problem sizes
-set(TEST_SQUARE_LARGE_BATCH --m=2048 --n=2048 --k=2048 -l=500 --iterations=0)      # Square problem sizes
+set(TEST_EPILOGUE --alpha=0.5 --beta=0.7 --iterations=1)                           # Default problem sizes
+set(TEST_EPILOGUE_LARGE_BATCH --alpha=1.5 --beta=2.0 -l=500 --iterations=1)        # Default problem sizes

-set(TEST_EPILOGUE --alpha=0.5 --beta=0.7 --iterations=0)                           # Default problem sizes
-set(TEST_EPILOGUE_LARGE_BATCH --alpha=1.5 --beta=2.0 -l=500 --iterations=0)        # Default problem sizes
+set(TEST_EPILOGUE_OP --beta=0.7 --iterations=1)                                    # Default problem sizes w/ Epilogue Op test
+set(TEST_EPILOGUE_OP_LARGE_BATCH --alpha=1.5 -l=500 --iterations=1)                # Default problem sizes w/ Epilogue Op test

-set(TEST_SMALLK --m=2048 --n=5120 --k=128 --l=5 --iterations=0)                    # Small-k problem sizes
-set(TEST_SMALLK_LARGE_BATCH --m=1024 --n=512 --k=64 --l=500 --iterations=0)        # Small-k problem sizes
+set(TEST_SMALLK --m=2048 --n=5120 --k=128 --l=5 --iterations=1)                    # Small-k problem sizes
+set(TEST_SMALLK_LARGE_BATCH --m=1024 --n=512 --k=64 --l=500 --iterations=1)        # Small-k problem sizes

 cutlass_example_add_executable(
  56_hopper_ptr_array_batched_gemm
@ -47,6 +47,8 @@ cutlass_example_add_executable(
  TEST_SQUARE_LARGE_BATCH
  TEST_EPILOGUE
  TEST_EPILOGUE_LARGE_BATCH
+  TEST_EPILOGUE_OP
+  TEST_EPILOGUE_OP_LARGE_BATCH
  TEST_SMALLK
  TEST_SMALLK_LARGE_BATCH
  )