v4.2 release. (#2587)
* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set in command line. * v4.2 release.
This commit is contained in:
@ -659,7 +659,7 @@ struct Testbed {
|
||||
}
|
||||
|
||||
int64_t flops = int64_t(options.problem_size.m()) * options.problem_size.n() * options.problem_size.k() * 2;
|
||||
int64_t bytes = cutlass::bits_to_bytes(
|
||||
int64_t bytes = cutlass::bits_to_bytes<int64_t>(
|
||||
(cutlass::sizeof_bits<ElementD>::value * 2 + cutlass::sizeof_bits<ElementSoftmax>::value) *
|
||||
options.problem_size.m() * options.problem_size.n());
|
||||
|
||||
|
||||
@ -33,8 +33,8 @@
|
||||
computing reference permutations of 4/5D tensors when source data is column-major.
|
||||
*/
|
||||
#pragma once
|
||||
#include <cuda/std/cassert>
|
||||
#include "cutlass/cutlass.h"
|
||||
#include CUDA_STD_HEADER(cassert)
|
||||
#include "cutlass/layout/pitch_linear.h"
|
||||
#include "cutlass/layout/matrix.h"
|
||||
#include "cutlass/coord.h"
|
||||
|
||||
@ -40,14 +40,12 @@
|
||||
Note that in general the fragment passed to the OutputOp could
|
||||
span multiple rows but it does not happen with the configurations we have
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda/std/cassert>
|
||||
|
||||
#include "cutlass/aligned_buffer.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/cutlass.h"
|
||||
#include CUDA_STD_HEADER(cassert)
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
#include "cutlass/layout/vector.h"
|
||||
|
||||
@ -42,12 +42,10 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda/std/cassert>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include CUDA_STD_HEADER(cassert)
|
||||
#include "cutlass/aligned_buffer.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/functional.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
#include "cutlass/layout/vector.h"
|
||||
|
||||
@ -38,10 +38,8 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda/std/cassert>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include CUDA_STD_HEADER(cassert)
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/layout/vector.h"
|
||||
|
||||
@ -37,12 +37,10 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cuda/std/cassert>
|
||||
|
||||
#include "cutlass/array.h"
|
||||
#include CUDA_STD_HEADER(cassert)
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/array.h"
|
||||
#include "cutlass/layout/vector.h"
|
||||
#include "cutlass/layout/tensor.h"
|
||||
#include "cutlass/tensor_coord.h"
|
||||
|
||||
@ -132,7 +132,7 @@ constexpr int ScaleGranularityK = 128;
|
||||
constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
|
||||
constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
|
||||
|
||||
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
|
||||
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
|
||||
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
|
||||
|
||||
@ -142,7 +142,7 @@ static constexpr int ScaleGranularityK = 128;
|
||||
static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
|
||||
static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN;
|
||||
|
||||
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
|
||||
using ScaleConfig = cutlass::detail::Sm90BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, cute::GMMA::Major::MN, cute::GMMA::Major::K>;
|
||||
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
|
||||
|
||||
@ -454,11 +454,12 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
|
||||
if (props.major != 10 || props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
//
|
||||
|
||||
@ -640,11 +640,11 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
|
||||
if (props.major != 10 || props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
|
||||
@ -33,7 +33,7 @@ set(TEST_SWIZZLE_2 --swizzle=2)
|
||||
set(TEST_SWIZZLE_5 --swizzle=5)
|
||||
set(TEST_SWIZZLE_5_UNEVEN --swizzle=5 --m=4096 --n=16384)
|
||||
|
||||
if(NOT CUTLASS_NVCC_ARCHS STREQUAL "100")
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
|
||||
cutlass_example_add_executable(
|
||||
70_blackwell_fp16_gemm
|
||||
70_blackwell_fp16_gemm.cu
|
||||
|
||||
@ -449,9 +449,9 @@ if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MIN
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
|
||||
|
||||
if (props.major != 10 || props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
# Both filenames are shorter to avoid MAX_PATH issues on Windows.
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
|
||||
cutlass_example_add_executable(
|
||||
71_blackwell_gemm_with_collective_builder
|
||||
71_blackwell_gemm_with_collective_builder.cu
|
||||
|
||||
@ -116,7 +116,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
|
||||
|
||||
// Kernel Perf config
|
||||
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
|
||||
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
|
||||
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
@ -511,10 +511,10 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
|
||||
@ -566,8 +566,8 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -117,7 +117,7 @@ using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // O
|
||||
|
||||
// Kernel Perf config
|
||||
using MmaTileShape = Shape<_256,_256,_256>; // MMA's tile size
|
||||
using ClusterShape = Shape<_4,_4,_1>; // Shape of the threadblocks in a cluster
|
||||
using ClusterShape = Shape<_2,_4,_1>; // Shape of the threadblocks in a cluster
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, OperatorClass,
|
||||
@ -512,8 +512,8 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -28,7 +28,7 @@
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
|
||||
cutlass_example_add_executable(
|
||||
72a_blackwell_nvfp4_bf16_gemm
|
||||
72a_blackwell_nvfp4_bf16_gemm.cu
|
||||
|
||||
@ -28,7 +28,7 @@
|
||||
|
||||
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
|
||||
cutlass_example_add_executable(
|
||||
73_blackwell_gemm_preferred_cluster
|
||||
blackwell_gemm_preferred_cluster.cu
|
||||
|
||||
@ -513,7 +513,7 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (props.major != 10 || props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100)." << std::endl;
|
||||
std::cerr << "This example requires a GPU with compute capability 100a)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -29,9 +29,9 @@
|
||||
|
||||
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
cutlass_example_add_executable(
|
||||
74_blackwell_gemm_streamk
|
||||
blackwell_gemm_streamk.cu
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a" OR CUTLASS_NVCC_ARCHS STREQUAL "100f" OR CUTLASS_NVCC_ARCHS STREQUAL "101a" OR CUTLASS_NVCC_ARCHS STREQUAL "101f" OR CUTLASS_NVCC_ARCHS STREQUAL "103a" OR CUTLASS_NVCC_ARCHS STREQUAL "103f")
|
||||
cutlass_example_add_executable(
|
||||
74_blackwell_gemm_streamk
|
||||
blackwell_gemm_streamk.cu
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -556,10 +556,19 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
if (__CUDACC_VER_MAJOR__ < 13) {
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((props.major != 10 || props.major != 11) && props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
//
|
||||
|
||||
@ -762,9 +762,8 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr
|
||||
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -138,8 +138,7 @@ using FusionOperation = cutlass::epilogue::fusion::LinCombEltActBlockScaleFactor
|
||||
|
||||
// Core kernel configurations
|
||||
using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature
|
||||
using EpilogueOperatorClass = cutlass::arch::OpClassTensorOp; // Epilogue Operator class tag
|
||||
using MainloopOperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Mainloop Operator class tag
|
||||
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp; // Operator class tag
|
||||
using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size
|
||||
|
||||
// Runtime Cluster Shape
|
||||
@ -159,7 +158,7 @@ struct MMA2SMConfig {
|
||||
};
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, EpilogueOperatorClass,
|
||||
ArchTag, OperatorClass,
|
||||
typename MMA1SMConfig::MmaTileShape, ClusterShape,
|
||||
Shape<_128,_64>,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
@ -169,7 +168,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
|
||||
// , FusionOperation // Enable for SF Output
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, MainloopOperatorClass,
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
@ -187,7 +186,7 @@ using Gemm1SM = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
using Gemm = Gemm1SM;
|
||||
|
||||
using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
ArchTag, EpilogueOperatorClass,
|
||||
ArchTag, OperatorClass,
|
||||
typename MMA2SMConfig::MmaTileShape, ClusterShape,
|
||||
Shape<_128,_64>,
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
@ -197,13 +196,13 @@ using CollectiveEpilogue2SM = typename cutlass::epilogue::collective::Collective
|
||||
// , FusionOperation // Enable for SF Output
|
||||
>::CollectiveOp;
|
||||
using CollectiveMainloop2SM = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
ArchTag, MainloopOperatorClass,
|
||||
ArchTag, OperatorClass,
|
||||
ElementA, LayoutA *, AlignmentA,
|
||||
ElementB, LayoutB *, AlignmentB,
|
||||
ElementAccumulator,
|
||||
typename MMA2SMConfig::MmaTileShape, ClusterShape,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
static_cast<int>(sizeof(typename CollectiveEpilogue2SM::SharedStorage))>,
|
||||
typename MMA2SMConfig::KernelSchedule
|
||||
>::CollectiveOp;
|
||||
using GemmKernel2SM = cutlass::gemm::kernel::GemmUniversal<
|
||||
@ -233,7 +232,7 @@ using LayoutSFD = typename Sm1xxBlockScaledOutputConfig::LayoutSF;
|
||||
std::vector<StrideA> stride_A_host;
|
||||
std::vector<StrideB> stride_B_host;
|
||||
std::vector<LayoutSFA> layout_SFA_host;
|
||||
std::vector<LayoutSFA> layout_SFB_host;
|
||||
std::vector<LayoutSFB> layout_SFB_host;
|
||||
std::vector<StrideC> stride_C_host;
|
||||
std::vector<StrideD> stride_D_host;
|
||||
|
||||
@ -897,9 +896,8 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
if (!(props.major == 10 && props.minor == 0)) {
|
||||
std::cerr
|
||||
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 100a).\n";
|
||||
if (props.major != 10 || (props.minor != 0 && props.minor != 1 && props.minor != 3)) {
|
||||
std::cerr << "This example requires a GPU with compute capability 100a|f, 101a|f, or 103a|f)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
|
||||
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
|
||||
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
if(CUTLASS_NVCC_ARCHS STREQUAL "100a")
|
||||
cutlass_example_add_executable(
|
||||
75_blackwell_grouped_gemm
|
||||
75_blackwell_grouped_gemm.cu
|
||||
|
||||
@ -504,10 +504,19 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
if (__CUDACC_VER_MAJOR__ < 13) {
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((props.major != 10 || props.major != 11) && props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
//
|
||||
|
||||
@ -504,10 +504,19 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
if (__CUDACC_VER_MAJOR__ < 13) {
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((props.major != 10 || props.major != 11) && props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
//
|
||||
|
||||
@ -500,10 +500,19 @@ int main(int argc, char const **args) {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
if (__CUDACC_VER_MAJOR__ < 13) {
|
||||
if (props.major != 10 && (props.minor != 0 || props.minor != 1)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 101)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((props.major != 10 || props.major != 11) && props.minor != 0) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 100 or 110)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Parse options
|
||||
//
|
||||
|
||||
@ -163,7 +163,7 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
|
||||
77_blackwell_mla.cu
|
||||
TEST_COMMAND_OPTIONS
|
||||
TEST_MLA_BASIC
|
||||
TEST_MLA_SEP_REDUCTION
|
||||
TEST_MLA_SEP_REDUCTION
|
||||
TEST_MLA_FUSE_REDUCTION
|
||||
)
|
||||
target_include_directories(77_blackwell_mla_2sm_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
@ -175,8 +175,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
|
||||
77_blackwell_mla.cu
|
||||
TEST_COMMAND_OPTIONS
|
||||
TEST_MLA_BASIC
|
||||
TEST_MLA_SEP_REDUCTION
|
||||
TEST_MLA_FUSE_REDUCTION
|
||||
TEST_MLA_SEP_REDUCTION
|
||||
TEST_MLA_FUSE_REDUCTION
|
||||
)
|
||||
target_include_directories(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_compile_definitions(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${PREC_MACRO} CPASYNC)
|
||||
|
||||
@ -100,7 +100,7 @@ public:
|
||||
cutlass::fmha::kernel::FmhaKernelBwdConvert<ProblemShape, Element, ElementAccumulator>
|
||||
>;
|
||||
|
||||
using OperationMha= cutlass::fmha::device::FMHA<
|
||||
using OperationNormal= cutlass::fmha::device::FMHA<
|
||||
cutlass::fmha::kernel::Sm100FmhaBwdKernelTmaWarpSpecialized<
|
||||
ProblemShape, Element, ElementAccumulator, TileShape, Mask
|
||||
>
|
||||
@ -112,7 +112,7 @@ public:
|
||||
>
|
||||
>;
|
||||
|
||||
using Operation = std::conditional_t<IsMla, OperationMla, OperationMha>;
|
||||
using Operation = std::conditional_t<IsMla, OperationMla, OperationNormal>;
|
||||
|
||||
using Kernel = typename Operation::Kernel;
|
||||
|
||||
|
||||
@ -365,7 +365,7 @@ struct Sm100FmhaGenKernelWarpspecialized {
|
||||
pipeline_corr_epi_params.role = CollectiveMainloop::PipelineE::ThreadCategory::Consumer;
|
||||
}
|
||||
pipeline_corr_epi_params.producer_arv_count = NumWarpsCorrection * cutlass::NumThreadsPerWarp;
|
||||
pipeline_corr_epi_params.consumer_arv_count = NumWarpsEpilogue * cutlass::NumThreadsPerWarp;
|
||||
pipeline_corr_epi_params.consumer_arv_count = cute::max(1, NumWarpsEpilogue * cutlass::NumThreadsPerWarp);
|
||||
typename CollectiveMainloop::PipelineE pipeline_corr_epi(
|
||||
shared_storage.pipelines.corr_epi,
|
||||
pipeline_corr_epi_params,
|
||||
|
||||
@ -117,7 +117,7 @@ using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBui
|
||||
ElementAccumulator, ElementAccumulator,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
cutlass::epilogue::FastF32NoSmemWarpSpecialized2Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
// Build the mainloop
|
||||
|
||||
@ -88,7 +88,7 @@
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
|
||||
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
|
||||
// Reference Output Tensor
|
||||
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
template <typename T>
|
||||
auto make_iterator(T* ptr) {
|
||||
@ -283,7 +283,7 @@ struct Result
|
||||
|
||||
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
@ -489,19 +489,28 @@ int run(Options &options)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
|
||||
// and must have compute capability at least 100.
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
// Must have compute capability at least 120.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
|
||||
//
|
||||
// Evaluate CUTLASS kernels
|
||||
//
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
run<Gemm>(options);
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -86,7 +86,7 @@
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -217,7 +217,7 @@ cutlass::HostTensor<ElementSFD, cutlass::layout::PackedVectorLayout> block_refer
|
||||
// Matrix-wide normalization constant
|
||||
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
template <typename T>
|
||||
auto make_iterator(T* ptr) {
|
||||
@ -311,7 +311,7 @@ struct Result
|
||||
|
||||
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
@ -536,19 +536,28 @@ int run(Options &options)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
|
||||
// and must have compute capability at least 100.
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
// Must have compute capability at least 120.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
@ -556,8 +565,8 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -577,9 +586,9 @@ int main(int argc, char const **args) {
|
||||
//
|
||||
// Evaluate CUTLASS kernels
|
||||
//
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
run<Gemm>(options);
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -88,7 +88,7 @@
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -189,7 +189,7 @@ cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
|
||||
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
|
||||
// Reference Output Tensor
|
||||
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
template <typename T>
|
||||
auto make_iterator(T* ptr) {
|
||||
@ -283,7 +283,7 @@ struct Result
|
||||
|
||||
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
@ -489,19 +489,28 @@ int run(Options &options)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
|
||||
// and must have compute capability at least 100.
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
// Must have compute capability at least 120.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
@ -509,8 +518,8 @@ int main(int argc, char const **args) {
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -530,9 +539,9 @@ int main(int argc, char const **args) {
|
||||
//
|
||||
// Evaluate CUTLASS kernels
|
||||
//
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
run<Gemm>(options);
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -97,7 +97,7 @@ using namespace cute;
|
||||
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>; // <M,N,K> per group
|
||||
using ElementInput = cutlass::float_e2m1_t; // Element type for Input matrix operands
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -263,7 +263,7 @@ cutlass::DeviceAllocation<ElementAccumulator> block_beta;
|
||||
// NormConst is a single device-side constant value, its not per-batch or per-group
|
||||
cutlass::DeviceAllocation<ElementAccumulator> norm_constant_device;
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
template <typename T>
|
||||
auto make_iterator(T* ptr) {
|
||||
@ -466,7 +466,7 @@ struct Result
|
||||
bool passed = false;
|
||||
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
@ -861,30 +861,39 @@ int run(Options &options, bool host_problem_shapes_available = true)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 Toolkit to run this example
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 ||
|
||||
((__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)
|
||||
)
|
||||
) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer.\n";
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support.\n";
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support.\n";
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
cudaError_t error = cudaGetDeviceProperties(&props, 0);
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr
|
||||
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120a).\n";
|
||||
<< "This example requires a GPU of NVIDIA's Blackwell Architecture (compute capability 120 or 121).\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -901,7 +910,7 @@ int main(int argc, char const **args) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
allocate(options);
|
||||
initialize(options);
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ set(TEST_SMALL_LARGE_GROUP --m=128 --n=128 --groups=50 --iterations=0)
|
||||
set(TEST_RANDOM_PERF --iterations=10) # Random problem sizes
|
||||
set(TEST_RANDOM_PERF_LARGE_GROUP --groups=50 --iterations=10) # Random problem sizes
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
|
||||
cutlass_example_add_executable(
|
||||
79a_blackwell_geforce_nvfp4_bf16_gemm
|
||||
79a_blackwell_geforce_nvfp4_bf16_gemm.cu
|
||||
|
||||
@ -78,7 +78,7 @@
|
||||
|
||||
#include "helper.h"
|
||||
using namespace cute;
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -248,7 +248,7 @@ struct Result
|
||||
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
|
||||
{}
|
||||
};
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -507,25 +507,34 @@ int run(Options &options)
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
|
||||
// and must have compute capability at least 120.
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
// Must have compute capability at least 120.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
@ -540,9 +549,9 @@ int main(int argc, char const **args) {
|
||||
//
|
||||
// Evaluate CUTLASS kernels
|
||||
//
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
run<Gemm>(options);
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
return 0;
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -78,7 +78,7 @@
|
||||
|
||||
#include "helper.h"
|
||||
using namespace cute;
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -183,7 +183,7 @@ cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> bloc
|
||||
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
|
||||
cutlass::HostTensor<outputScaleFactor, cutlass::layout::PackedVectorLayout> block_reference_SFD;
|
||||
cutlass::HostTensor<ElementCompute, cutlass::layout::PackedVectorLayout> block_Normconst;
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
template <typename T>
|
||||
auto make_iterator(T* ptr) {
|
||||
return cute::recast_ptr<T>(ptr);
|
||||
@ -259,7 +259,7 @@ struct Result
|
||||
avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
|
||||
{}
|
||||
};
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -531,25 +531,34 @@ int run(Options &options)
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char const **args) {
|
||||
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit to run this example
|
||||
// and must have compute capability at least 120.
|
||||
// CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
|
||||
// or CUDA 12.9 or higher for SM121 support.
|
||||
// Must have compute capability at least 120.
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
|
||||
std::cerr << "This example requires CUDA 12.8 or newer." << std::endl;
|
||||
std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
|
||||
std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
|
||||
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
cudaDeviceProp props;
|
||||
int current_device_id;
|
||||
CUDA_CHECK(cudaGetDevice(¤t_device_id));
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
|
||||
|
||||
if (!(props.major == 12 && props.minor == 0)) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120)." << std::endl;
|
||||
if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
|
||||
std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 120 or 121)." << std::endl;
|
||||
return 0;
|
||||
}
|
||||
//
|
||||
@ -564,9 +573,9 @@ int main(int argc, char const **args) {
|
||||
//
|
||||
// Evaluate CUTLASS kernels
|
||||
//
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
run<Gemm>(options);
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
return 0;
|
||||
}
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -27,7 +27,7 @@
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 120a)
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")
|
||||
cutlass_example_add_executable(
|
||||
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm
|
||||
80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu
|
||||
|
||||
104
examples/81_blackwell_gemm_blockwise/README.md
Normal file
104
examples/81_blackwell_gemm_blockwise/README.md
Normal file
@ -0,0 +1,104 @@
|
||||
# Blockwise and Groupwise GEMM and Grouped GEMM on Blackwell
|
||||
|
||||
Blockwise and Groupwise GEMM and Grouped GEMM implement software scaling by the accumulator type.
|
||||
The examples in this directory aim to demonstrate how we can instantiate this kernel and run it.
|
||||
The profiler enables instantiating and profiling different kernel configurations for Blockwise and Groupwise GEMM
|
||||
to determine the best performing kernel for your workload.
|
||||
|
||||
## Introduction
|
||||
Blockwise and Groupwise GEMM operations enable fine-grained numerical precision control by applying scale factors at configurable granularities. This is particularly useful for quantized neural networks where different regions of tensors may have different scaling requirements.
|
||||
|
||||
For a GEMM $D = \alpha A B + \beta C$, we introduce two scale factor tensors, SFA
|
||||
and SFB. This leads to a GEMM $D = \alpha \text{SFA} * A \text{ SFB} * B + \beta C$.
|
||||
|
||||
## Scale Factor Tensors
|
||||
- *SFA*: Broadcast the same scale within a block defined by _scale granularity m_ and _scale granularity k_ when scaling A.
|
||||
- Scale granularity m and scale granularity k are also referred to as _scale vector m_ and _k_ respectively.
|
||||
- *SFB*: Broadcast the same scale within a block defined by _scale granularity n_ and _scale granularity k_ when scaling B.
|
||||
- Scale granularity n and scale granularity k are also referred to as _scale vector n_ and _k_ respectively.
|
||||
|
||||
These can be represented in CuTe as:
|
||||
- *SFA Layout*: $((\text{scale granularity M}, M / \text{scale granularity M}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
|
||||
- *SFB Layout*: $((\text{scale granularity N}, M / \text{scale granularity M}), (\text{scale granularity K}, K / \text{scale granularity K})) : ((0, int), (0, int))$
|
||||
|
||||
The 0 element stride ensures the same group of coordinates to map to the same element in the scale factors.
|
||||
|
||||
## Configuration
|
||||
|
||||
For convenience the Blockwise and Groupwise implementation provide
|
||||
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>`
|
||||
to deduce layouts and manage compact tensors.
|
||||
|
||||
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>` by default makes
|
||||
every tensor major the M/N mode, but can be configured. For example:
|
||||
`cutlass::detail::Sm100BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, UMMA::Major::K, UMMA::Major::MN>`
|
||||
denotes SFA will be major in the K dimension but SFB will be major in the N dimension.
|
||||
|
||||
## Integration with Other Frameworks
|
||||
|
||||
If translating from frameworks like Torch where SFA has shape
|
||||
(M / ScaleGranularityM, K / ScaleGranularityK) and SFB has a shape (K / ScaleGranularityK, N / ScaleGranularityN),
|
||||
ensure to transpose SFB and B to fit into the canonical CuTe layout form. This ensures K is always the second mode.
|
||||
Use strides can be used to determine if each tensor is MN or K major to correctly form the layouts either directly
|
||||
or with the convenience wrappers.
|
||||
|
||||
|
||||
## Kernel Selection and Profiling
|
||||
|
||||
To determine the most performance Blockwise/Groupwise GEMM or Grouped GEMM kernel for your use case, you can utilize the
|
||||
[CUTLASS profiler](../../media/docs/cpp/profiler.md).
|
||||
|
||||
All Blockwise/Groupwise GEMMs and Group GEMMs with `f32` scaling of `e4m3` or runtime `f8` types can be selected by
|
||||
selecting a subset of kernels when configuring with CMake by passing:
|
||||
`-DCUTLASS_LIBRARY_KERNELS="cutlass3x*f32xe4m3_*f32xe4m3*,cutlass3x*f32xf8_*f32xf8*"`.
|
||||
|
||||
The simplest way to use the profiler is to pass `m`, `n`, and `k` as well as your `scale_vec_size_m`,
|
||||
`scale_vec_size_n`, and `scale_vec_size_k`. Passing `enable-best-kernel-for-fixed-shape` will do some autotuning
|
||||
per kernel to determine best rasterization orders, swizzles, and cluster sizes. Passing `blockwiseGemm`
|
||||
or `GroupedGemm` through the operation flag will determine which set of operations will be profiled.
|
||||
|
||||
For examle, this command using the cutlass profiler will dump the performance of all compiled kernels which support scale
|
||||
granularity m = 1, scale granularity n = 128, and scale granularity k = 128 for the problem size 8192x8192x8192:
|
||||
```
|
||||
cutlass_profiler --operation=blockwiseGemm \
|
||||
--enable-best-kernel-for-fixed-shape \
|
||||
--m=8192 --n=8192 --k=8192 \
|
||||
--scale_vec_size_m=1 --scale_vec_size_n=128 --scale_vec_size_k=128 \
|
||||
--verification-enabled=false
|
||||
```
|
||||
|
||||
### Kernel Naming Convention
|
||||
|
||||
The naming of the blockwise and groupwise kernels includes the following new pattern: for each tensor scalar pair we have
|
||||
`<scale_granularity_m or scale_granularity_n>x<scale_granularity_k><accumulator type>x<scaled tensor type>`. For example
|
||||
`cutlass3x_sm100_tensorop_gemm_64x128f32xe4m3_1x128f32xe4m3_f32_f16_f16_64x128x128_1x1x1_0_nnn_align16_1sm` would denote:
|
||||
- A CUTLASS 3 GEMM for SM100 that uses tensor cores.
|
||||
- SFA is f32 with a 64 element scale granularity m and a 128 element scale granularity k.
|
||||
- The A matrix is e4m3.
|
||||
- SFB is f32 with a 1 element scale granularity n and a 128 element scale granularity k.
|
||||
- The B matrix is e4m3.
|
||||
- The epilogue is done in f32.
|
||||
- The C matrix is f16.
|
||||
- The D matrix is f16.
|
||||
- The MMA tile shape is 64x128x128.
|
||||
- The cluster shape is 1x1x1.
|
||||
- A, B, C, and D are all column major.
|
||||
- The alignment of the major modes are 16 elements for A, B, C, and D.
|
||||
- The MMA variant is a 1SM instruction.
|
||||
|
||||
It is also worthwhile to note that C can be void if scaling by beta is not needed.
|
||||
|
||||
## Performance Tips and Tricks
|
||||
|
||||
- *MMA Dimensions*: in both Blackwell and Hopper tensor cores it is worthwhile to note that the smallest `MMA_M` dimension is 64, but `MMA_N`
|
||||
dimension can be as small as 8 for some instructions. For problem sizes where M is small consider computing $D^T = \alpha B^T A^T + \beta C^T$ instead.
|
||||
- When computing after swapping A and B and transposing the N dimension is now our small dimension. With a small `MMA_N` we can more effectively tile without performing unecessary computation.
|
||||
- *Layout Swapping*: When optimizing with the profiler swap `m` and `n` inputs and adjust layouts to reflect this swapping and transposing.
|
||||
- For example if we have a row-major A, column-major B, and row-major D, we can swap tensors and run a kernel with:
|
||||
- The left hand matrix as row-major (since B transposed is row-major)
|
||||
- A right hand matrix as column-major (since A transposed is column-major)
|
||||
- A column-major output (since D transposed is column-major).
|
||||
|
||||
When using blockwise and groupwise GEMM we must swap the scale vector sizes when doing this optimization. If we have a 1 element scale granularity M
|
||||
and a 128 element scale granularity N, we must run a kernel with a 128 element scale granularity M and a 1 element scale granularity
|
||||
N.
|
||||
@ -0,0 +1,495 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
|
||||
#include "cutlass/util/reference/device/gemm.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||
|
||||
#include "cutlass/util/mixed_dtype_utils.hpp"
|
||||
#include "cutlass/detail/collective/mixed_input_utils.hpp"
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
#include "mixed_dtype_helper.cuh"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
using MmaType = cutlass::bfloat16_t;
|
||||
using QuantType = cutlass::int4b_t;
|
||||
using AccumulatorType = float;
|
||||
|
||||
// A matrix configuration
|
||||
using ElementA = MmaType; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = QuantType; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// This example manually swaps and transposes, so keep transpose of input layouts
|
||||
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
|
||||
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
|
||||
|
||||
using ElementZero = MmaType;
|
||||
using ElementScale = MmaType;
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// D matrix configuration
|
||||
using ElementD = cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutD = cutlass::layout::RowMajor;
|
||||
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
|
||||
|
||||
// Core kernel configurations
|
||||
using ElementAccumulator = AccumulatorType; // Element type for internal accumulation
|
||||
using ElementCompute = AccumulatorType;                                     // Element type for epilogue computation

using ArchTag = cutlass::arch::Sm100;                                       // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp;                       // Operator class tag

using MmaTileShape = Shape<_256,_128,_128>;                                 // (MmaTileShape_N, MmaTileShape_M, MmaTileShape_K) as A and B will be swapped
using ClusterShape = Shape<_2,_1,_1>;                                       // Shape of the threadblocks in a cluster

using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized2SmMixedInputSm100; // Kernel to launch based on the default setting in the Collective Builder
using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized2Sm;          // Only epilogue schedule supporting the explicit swap + transpose used below
using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;

// Blockwise scaling granularity for the quantized operand.
constexpr int ScaleGranularityN = 1;   // Should be less than or equal to GEMM_N
constexpr int ScaleGranularityK = 128; // Should be less than or equal to GEMM_K
using ScaleConfig = cutlass::detail::Sm100MixedInputBlockwiseScaleConfig<ScaleGranularityN, ScaleGranularityK>;

using LayoutScale = decltype(ScaleConfig::deduce_layout_scale()); // Layout type for the scale-factor operand (paired with B in this example)
LayoutScale layout_S;  // Blockwise scale layout, built in initialize() from (n, k, l)

// Shared epilogue for all three kernel variants below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaTileShape, ClusterShape,
    EpilogueTileType,
    ElementAccumulator, ElementCompute,
    // Transpose layout of D here since we use explicit swap + transpose
    // the void type for C tells the builder to allocate 0 smem for the C matrix.
    // We can enable this if beta == 0 by changing ElementC to void below.
    ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type, AlignmentC,
    ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type, AlignmentD,
    EpilogueSchedule // This is the only epi supporting the required swap + transpose.
  >::CollectiveOp;

// ============================================================ MIXED INPUT NO SCALES ============================================================================
// The collective will infer that the narrow type should be upcasted to the wide type.
// We swap A and B operands to the builder here.
using CollectiveMainloopConvertOnly = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    cute::tuple<ElementB>, LayoutB_Transpose, AlignmentB,
    ElementA, LayoutA_Transpose, AlignmentA,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
    >,
    MainloopSchedule
  >::CollectiveOp;

using GemmKernelConvertOnly = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, // Indicates ProblemShape
    CollectiveMainloopConvertOnly,
    CollectiveEpilogue
>;

using GemmConvertOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelConvertOnly>;

// =========================================================== MIXED INPUT WITH SCALES ===========================================================================
// The Scale information must get paired with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the scale information.
using CollectiveMainloopScaleOnly = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    cute::tuple<ElementB, ElementScale>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
    ElementA, LayoutA_Transpose, AlignmentA,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
    >,
    MainloopSchedule
  >::CollectiveOp;

using GemmKernelScaleOnly = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, // Indicates ProblemShape
    CollectiveMainloopScaleOnly,
    CollectiveEpilogue
>;

using GemmScaleOnly = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleOnly>;

// =========================================================== MIXED INPUT WITH SCALES AND ZEROS ==================================================================
// We specify scale + zero elements to indicate that we require both. Scales and biases have the same format.
using CollectiveMainloopScaleWithZeroPoint = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    cute::tuple<ElementB, ElementScale, ElementZero>, cute::tuple<LayoutB_Transpose, LayoutScale>, AlignmentB,
    ElementA, LayoutA_Transpose, AlignmentA,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<
      static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))
    >,
    MainloopSchedule
  >::CollectiveOp;

using GemmKernelScaleWithZeroPoint = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, // Indicates ProblemShape
    CollectiveMainloopScaleWithZeroPoint,
    CollectiveEpilogue
>;

using GemmScaleWithZeroPoint = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelScaleWithZeroPoint>;
// =================================================================================================================================================================

// Strides for the logical (non-swapped) A/B operands.
using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;

// Kernel-facing C/D strides are transposed (swap + transpose); the *_ref
// strides describe the untransposed layouts used by the reference GEMM.
using StrideC = typename GemmKernelScaleOnly::StrideC;
using StrideD = typename GemmKernelScaleOnly::StrideD;
using StrideC_ref = cutlass::detail::TagToStrideC_t<LayoutC>;
using StrideD_ref = cutlass::detail::TagToStrideC_t<LayoutD>;

//
// Data members
//

/// Initialization
StrideA stride_A;
StrideB stride_B;
StrideC stride_C;
StrideC_ref stride_C_ref;
StrideD stride_D;
StrideD_ref stride_D_ref;
uint64_t seed;  // base PRNG seed; initialize() derives per-tensor seeds from it

// Scale and Zero share a stride since the layout and shapes must be the same.
using StrideS = typename cute::Stride<cute::Int<1>, int64_t, int64_t>;
using StrideS_ref = cutlass::detail::TagToStrideB_t<LayoutScale>;
StrideS stride_S;
StrideS_ref stride_S_ref;

// Device-side operand storage.
cutlass::DeviceAllocation<ElementA> block_A;
cutlass::DeviceAllocation<ElementB> block_B;
cutlass::DeviceAllocation<MmaType> block_B_dq;  // dequantized copy of B for the reference GEMM
cutlass::DeviceAllocation<ElementScale> block_scale;
cutlass::DeviceAllocation<ElementZero> block_zero;
cutlass::DeviceAllocation<ElementC> block_C;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_D;
cutlass::DeviceAllocation<typename GemmScaleOnly::EpilogueOutputOp::ElementOutput> block_ref_D;

#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Testbed utility types
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Initialize operands to be used in the GEMM and reference GEMM
|
||||
void initialize(MixedDtypeOptions const& options) {

  // Pack strides for the logical problem (m, n, k, l).
  auto shape_b = cute::make_shape(options.n, options.k, options.l);
  stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
  stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_b);
  // Reverse stride here due to swap and transpose
  stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.n, options.m, options.l));
  stride_C_ref = cutlass::make_cute_packed_stride(StrideC_ref{}, cute::make_shape(options.m, options.n, options.l));
  // Reverse stride here due to swap and transpose
  stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.n, options.m, options.l));
  stride_D_ref = cutlass::make_cute_packed_stride(StrideD_ref{}, cute::make_shape(options.m, options.n, options.l));

  // Blockwise scale layout covering the (n, k, l) extents of operand B.
  layout_S = ScaleConfig::tile_atom_to_shape_scale(make_shape(options.n, options.k, options.l));

  // Flatten the batch dimension into the allocation extents.
  auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
  auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
  auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
  // One scale (and zero) element per non-replicated entry of layout_S.
  auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_S)));

  block_A.reset(a_coord.product());
  block_B.reset(b_coord.product());
  block_B_dq.reset(b_coord.product());
  block_C.reset(c_coord.product());
  block_D.reset(c_coord.product());
  block_ref_D.reset(c_coord.product());

  block_scale.reset(blockscale_b_coord.product());
  block_zero.reset(blockscale_b_coord.product());

  // Distinct seed offsets keep the operand fills decorrelated.
  initialize_tensor(block_A, seed + 2022);
  initialize_quant_tensor(block_B, seed + 2021);
  initialize_tensor(block_C, seed + 2020);
  initialize_scale<QuantType, ElementScale>(block_scale, options);
  initialize_zero(block_zero, options);

  // The dequantized copy of B is only needed by the reference GEMM in verify().
  if(options.verify){
    auto layout_B = make_layout(shape_b, stride_B);
    auto scale_stride = layout_S.stride();
    // Collapse the tiled scale layout into a plain (n, scale_k, l) view for dequantize().
    auto layout_scale_zero = make_layout(
      make_shape(size<0>(layout_S), size<1,1>(layout_S), size<2>(layout_S)),
      make_stride(size<0,1>(scale_stride), size<1,1>(scale_stride), size<2>(scale_stride))
    ); //layout = (options.n, scale_k, options.l) : (_1, options.n, _0)
    cudaStream_t stream = cudaStreamDefault;
    cutlass::dequantize(block_B_dq.get(), block_B.get(), layout_B, block_scale.get(), block_zero.get(), layout_scale_zero, ScaleGranularityK, stream);
  }
}
|
||||
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options.
/// The mainloop argument tuple grows with the conversion mode: direct convert
/// passes only operands, ConvertAndScale adds (scale, layout), and
/// ConvertAndScaleWithZero additionally adds the zero-point pointer.
template <class Args, cutlass::detail::ConversionMode KernelConversionMode>
Args args_from_options(MixedDtypeOptions const& options)
{
  // Swap the A and B tensors, as well as problem shapes here.
  if constexpr (KernelConversionMode == cutlass::detail::ConversionMode::DirectConvert) {
    return Args {
      cutlass::gemm::GemmUniversalMode::kGemm,
      {options.n, options.m, options.k, options.l},
      {block_B.get(), stride_B, block_A.get(), stride_A},
      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
    };
  }
  else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScale) {
    return Args {
      cutlass::gemm::GemmUniversalMode::kGemm,
      {options.n, options.m, options.k, options.l},
      {block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S},
      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
    };
  }
  else if constexpr(KernelConversionMode == cutlass::detail::ConversionMode::ConvertAndScaleWithZero) {
    return Args {
      cutlass::gemm::GemmUniversalMode::kGemm,
      {options.n, options.m, options.k, options.l},
      {block_B.get(), stride_B, block_A.get(), stride_A, block_scale.get(), layout_S, block_zero.get()},
      {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
    };
  } else {
    // NOTE(review): unreachable for the three modes instantiated in this file;
    // a dependent static_assert would surface an unsupported mode at compile
    // time instead of exiting at runtime — confirm before changing.
    exit(-1);
  }
}
|
||||
|
||||
/// Verifies the mixed-input result against a plain wide-type GEMM run on the
/// pre-dequantized copy of B (block_B_dq, filled in initialize()).
bool verify(MixedDtypeOptions const& options) {
  //
  // Compute reference output
  //

  // Reference mainloop: both operands use the wide MmaType in their original
  // (non-swapped, non-transposed) layouts.
  using CollectiveMainloopRef = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaType, LayoutA, AlignmentA,
    MmaType, LayoutB, AlignmentB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Reference epilogue: untransposed C/D layouts (no swap + transpose needed).
  using CollectiveEpilogueRef = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementAccumulator,
    ElementC, LayoutC, AlignmentC,
    ElementD, LayoutD, AlignmentD,
    EpilogueSchedule
  >::CollectiveOp;

  using GemmKernelRef = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, // Indicates ProblemShape
    CollectiveMainloopRef,
    CollectiveEpilogueRef
  >;

  using GemmRef = cutlass::gemm::device::GemmUniversalAdapter<GemmKernelRef>;

  // Note: problem shape is (m, n, k, l) here — no operand swap for the reference.
  typename GemmRef::Arguments arguments{
    cutlass::gemm::GemmUniversalMode::kGemm,
    {options.m, options.n, options.k, options.l},
    {block_A.get(), stride_A, block_B_dq.get(), stride_B},
    {{options.alpha, options.beta}, block_C.get(), stride_C_ref, block_ref_D.get(), stride_D_ref}
  };

  // Run the gemm where the scaling is performed outside of the kernel.
  GemmRef gemm_ref;
  size_t workspace_size = GemmRef::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
  CUTLASS_CHECK(gemm_ref.can_implement(arguments));
  CUTLASS_CHECK(gemm_ref.initialize(arguments, workspace.get()));
  CUTLASS_CHECK(gemm_ref.run());

  // compare_reference: relative comparison with a floor to avoid division by
  // near-zero reference values.
  ElementD const epsilon(1e-2f);
  ElementD const non_zero_floor(1e-2f);
  bool passed = cutlass::reference::device::BlockCompareRelativelyEqual(block_ref_D.get(), block_D.get(), block_D.size(), epsilon, non_zero_floor);
  return passed;
}
|
||||
|
||||
/// Execute a given example GEMM computation
|
||||
/// Execute a given example GEMM computation: build arguments from the command
/// line, run one correctness/warmup iteration, optionally verify against the
/// reference, then hand off to the profiling loop.
template <typename Gemm>
int run(MixedDtypeOptions &options)
{
  initialize(options);

  // Instantiate the CUTLASS kernel for the requested conversion mode.
  Gemm gemm_op;

  // Build kernel arguments (with A/B swap) from the command-line options.
  auto args = args_from_options<typename Gemm::Arguments, Gemm::CollectiveMainloop::KernelConversionMode>(options);

  // Query and allocate any extra device workspace the kernel needs.
  size_t workspace_bytes = Gemm::get_workspace_size(args);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_bytes);

  // Validate the problem, then initialize kernel state.
  CUTLASS_CHECK(gemm_op.can_implement(args));
  CUTLASS_CHECK(gemm_op.initialize(args, workspace.get()));

  // Correctness / Warmup iteration
  CUTLASS_CHECK(gemm_op.run());

  // Compare against the reference kernel when verification is requested.
  MixedDtypeResult result;
  if (options.verify) {
    result.passed = verify(options);
    std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
  }
  else {
    result.passed = true;
    std::cout << " Verification: Off " << std::endl;
  }
  if (!result.passed) {
    exit(-1);
  }

  mixed_dtype_profiling(gemm_op, options, result);
  return 0;
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {

  // CUTLASS must be compiled with the CUDA 12.8 Toolkit (or newer) to run this
  // example and must have compute capability at least 100a.
  //
  // Compare (major, minor) lexicographically: the previous
  // (major >= 12 && minor >= 8) form wrongly rejected CUDA 13.0+ because the
  // minor version resets to 0 on a major bump.
  bool is_correct_cuda_version =
      (__CUDACC_VER_MAJOR__ > 12) ||
      (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8);
  if (!is_correct_cuda_version) {
    // Report the full toolkit version (the old message printed only the minor).
    std::cerr << "Version is " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "\n";
    std::cerr << "This example requires CUDA 12.8 or newer.\n";
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }

  cudaDeviceProp props;
  int current_device_id;
  CUDA_CHECK(cudaGetDevice(&current_device_id));
  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
  // (Removed a redundant, unchecked duplicate cudaGetDeviceProperties(&props, 0)
  // call whose cudaError_t result was never used.)
  // NOTE(review): this accepts compute capability 10.0 only, while the message
  // says "or greater" — confirm which is intended.
  if (props.major != 10 || props.minor != 0) {
    std::cerr
      << "This example requires a GPU of NVIDIA's Blackwell Architecture or "
      << "later (compute capability 100a or greater).\n";
    return 0;
  }

  //
  // Parse options
  //

  MixedDtypeOptions options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Evaluate CUTLASS kernels
  //

#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
  // Dispatch to the kernel variant matching the requested conversion mode.
  if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
    std::cout << "Running in conversion only mode." << std::endl;
    run<GemmConvertOnly>(options);
  }
  else if (options.mode == MixedDtypeGemmMode::ScaleOnly) {
    std::cout << "Running in scale mode." << std::endl;
    run<GemmScaleOnly>(options);
  }
  else if (options.mode == MixedDtypeGemmMode::ScaleWithZeroPoint) {
    std::cout << "Running in scale and zero mode." << std::endl;
    run<GemmScaleWithZeroPoint>(options);
  }
  else {
    std::cerr << "Invalid mode " << options.mode << ". Must be 0, 1 or 2." << std::endl;
  }
#endif

  return 0;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
45
examples/86_blackwell_mixed_dtype_gemm/CMakeLists.txt
Normal file
45
examples/86_blackwell_mixed_dtype_gemm/CMakeLists.txt
Normal file
@ -0,0 +1,45 @@
|
||||
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
# Test invocations: each variable is a command-line option set passed to the
# example binary by CTest. --verify enables the reference comparison and
# --iterations=0 skips the profiling loop so tests only check correctness.
set(TEST_S_TILE_SHAPE --m=256 --n=128 --k=32 --verify --iterations=0)
set(TEST_S_TILE_SHAPE_MULTIPLE_KITER --m=256 --n=128 --k=128 --verify --iterations=0)
# NOTE(review): TEST_S_DIFFERENT_MN is defined but not listed under
# TEST_COMMAND_OPTIONS below — confirm whether its omission is intentional
# (it is a much larger problem and may be too slow for CI).
set(TEST_S_DIFFERENT_MN --m=16384 --n=4608 --k=4608 --verify --iterations=0)
set(TEST_S_ONE_WAVE --m=1536 --n=1536 --k=32 --verify --iterations=0) # Assuming 144 SMs
set(TEST_S_2048 --m=2048 --n=2048 --k=2048 --verify --iterations=0) # Multi-wave

if(NOT WIN32)
cutlass_example_add_executable(
  86_blackwell_mixed_dtype_gemm
  86_blackwell_mixed_dtype.cu
  TEST_COMMAND_OPTIONS
  TEST_S_TILE_SHAPE
  TEST_S_TILE_SHAPE_MULTIPLE_KITER
  TEST_S_ONE_WAVE
  TEST_S_2048
  )
endif()
|
||||
269
examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh
Normal file
269
examples/86_blackwell_mixed_dtype_gemm/mixed_dtype_helper.cuh
Normal file
@ -0,0 +1,269 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/default_epilogue.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/reference/device/tensor_fill.h"
|
||||
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
|
||||
#include <cuda.h>
|
||||
#include <numeric>
|
||||
#include "helper.h"
|
||||
|
||||
// Kernel variant selector (set via --mode; meanings match print_usage below).
enum MixedDtypeGemmMode {
  ConvertOnly,        // 0: A @ B (upcast the narrow operand only)
  ScaleOnly,          // 1: A @ (scale * B)
  ScaleWithZeroPoint  // 2: A @ (scale * B + zero-point)
};
|
||||
|
||||
/// Command line options parsing
|
||||
struct MixedDtypeOptions {

  bool help = false;      // print usage and exit
  bool verify = false;    // run reference verification after the GEMM

  float alpha = 1.0f;     // epilogue scalar alpha
  float beta = 0.0f;      // epilogue scalar beta
  int iterations = 1000;  // timed profiling iterations (<= 0 skips profiling)
  int warmup = 1000;      // warmup iterations excluded from timing
  int mode = 1;           // 0: A @ B, 1: A @ (scale * B), 2: A @ (scale * B + zero-point)
  int m = 5120, n = 4096, k = 4096;  // GEMM problem extents
  int l = 1;              // number of independent GEMM problems (batch count)

  // Parses the command line
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    // --verify is a boolean flag: its presence enables verification.
    if (cmd.check_cmd_line_flag("verify")) {
      verify = true;
    }

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("n", n);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("l", l);
    cmd.get_cmd_line_argument("mode", mode);
    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
    cmd.get_cmd_line_argument("beta", beta, 0.f);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("warmup", warmup);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "86_blackwell_mixed_dtype_gemm\n\n"
      << "  Blackwell Mixed Data Type GEMM using a Warp Specialized kernel.\n\n"
      << "Options:\n\n"
      << "  --help                      If specified, displays this usage statement\n\n"
      << "  --m=<int>                   Sets the M extent of the GEMM\n"
      << "  --n=<int>                   Sets the N extent of the GEMM\n"
      << "  --k=<int>                   Sets the K extent of the GEMM\n"
      << "  --l=<int>                   The number of independent gemm problems with mnk shape\n"
      << "  --mode=<int>                The mode to run the gemm. 0 does (A @ B), 1 means A @ (scale * B), 2 means A @ (scale * B + zero-point).\n"
      << "  --alpha=<f32>               Epilogue scalar alpha\n"
      << "  --beta=<f32>                Epilogue scalar beta\n\n"
      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n"
      << "  --warmup=<int>              Number of warmup iterations to perform.\n\n"
      // Fixed: previously documented as "--verify=<int>", but parse() treats
      // --verify as a valueless flag via check_cmd_line_flag().
      << "  --verify                    Run verification.\n\n";

    out
      << "\n\nExamples:\n\n"
      << "$ " << "86_blackwell_mixed_dtype_gemm" << " --m=1024 --n=512 --k=1024 --l=10 --alpha=2 --mode=2 --beta=0.707 \n\n";

    return out;
  }

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const
  {
    // Two flops per multiply-add
    uint64_t flop = uint64_t(2) * m * n * k * l;
    double gflop = double(flop) / double(1.0e9);
    return gflop / runtime_s;
  }
};
|
||||
|
||||
/// Result structure
|
||||
struct MixedDtypeResult
{
  double avg_runtime_ms = 0.0;  // mean runtime over the timed iterations (set by mixed_dtype_profiling)
  double gflops = 0.0;          // throughput derived from avg_runtime_ms via MixedDtypeOptions::gflops
  cutlass::Status status = cutlass::Status::kSuccess;  // CUTLASS status code (not updated in the code shown here — TODO confirm)
  cudaError_t error = cudaSuccess;                     // CUDA runtime error code (not updated in the code shown here — TODO confirm)
  bool passed = false;          // verification disposition set by run()

};
|
||||
|
||||
/// Profiling Loop
|
||||
/// Profiling Loop: runs `warmup + iterations` launches of an already-initialized
/// Gemm, times each timed launch with CUDA events, and stores the average
/// runtime and GFLOP/s in `result`. No-op when options.iterations <= 0.
template <class Gemm>
void mixed_dtype_profiling(
  Gemm& gemm,
  MixedDtypeOptions const& options,
  MixedDtypeResult& result) {

  if (options.iterations <= 0) return;

  // All CUDA event calls are now checked (previously unchecked), consistent
  // with the CUTLASS_CHECK on the kernel launch below.
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));

  std::vector<float> runtimes;
  runtimes.reserve(options.iterations);

  for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
    CUDA_CHECK(cudaEventRecord(start));
    CUTLASS_CHECK(gemm.run());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Only iterations past the warmup phase contribute to the average.
    if (iter >= options.warmup) {
      float milliseconds = 0;
      CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
      runtimes.push_back(milliseconds);
    }
  }

  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));

  // Compute average setup and runtime and GFLOPs.
  result.avg_runtime_ms = std::accumulate(runtimes.begin(), runtimes.end(), 0.0f) / runtimes.size();
  result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);

  std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
  std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
  std::cout << "  GFLOPS: " << result.gflops << std::endl;

}
|
||||
|
||||
/// Helpers to initialize a block of device data
|
||||
/// Helpers to initialize a block of device data.
/// Fills `block` with uniform random values in a range sized to the element's
/// bit width so magnitudes stay representable after conversion.
template <class Element>
bool initialize_tensor(
  cutlass::DeviceAllocation<Element>& block,
  uint64_t seed = 2023) {

  double scope_max, scope_min;
  // Fixed: the original computed this value twice (bits_input / bits_output,
  // both cutlass::sizeof_bits<Element>::value) and branched on each copy;
  // a single variable has identical behavior.
  int bits = cutlass::sizeof_bits<Element>::value;

  if (bits == 1) {
    scope_max = 2;
    scope_min = 0;
  }
  else if (bits <= 8) {
    scope_max = 2;
    scope_min = -2;
  }
  else if (bits == 16) {
    scope_max = 5;
    scope_min = -5;
  }
  else {
    scope_max = 8;
    scope_min = -8;
  }
  cutlass::reference::device::BlockFillRandomUniform(
    block.get(), block.size(), seed, Element(scope_max), Element(scope_min));

  return true;
}
|
||||
|
||||
/// Fills the quantized operand with uniform random values spanning the full
/// representable range of the narrow element type.
template <typename Element>
bool initialize_quant_tensor(
  cutlass::DeviceAllocation<Element>& block,
  uint64_t seed = 2023) {

  float const range_lo = float(cutlass::platform::numeric_limits<Element>::lowest());
  float const range_hi = float(cutlass::platform::numeric_limits<Element>::max());

  cutlass::reference::device::BlockFillRandomUniform(
    block.get(), block.size(), seed, Element(range_hi), Element(range_lo));

  return true;
}
|
||||
|
||||
/// Fills the scale tensor. In ConvertOnly mode all scales are 1 so the same
/// dequantize kernel can be reused; otherwise scales are drawn so that
/// dequantized magnitudes land in [0.5, 4].
template <class QuantType, class Element>
bool initialize_scale(
  cutlass::DeviceAllocation<Element>& block,
  MixedDtypeOptions const& options,
  uint64_t seed = 2023) {

  if (options.mode == MixedDtypeGemmMode::ConvertOnly) {
    // No scales, so just initialize with 1 so we can use the same kernel to dequantize the data.
    std::vector<Element> host_ones(block.size(), Element(1.0f));
    block.copy_from_host(host_ones.data());
    return true;
  }

  float const elt_max_f = float(cutlass::platform::numeric_limits<QuantType>::max());
  const float max_dequant_val = 4.f;
  const float min_dequant_val = 0.5f;

  // Divide by the quant type's max so quant_value * scale stays in range.
  float const range_hi(max_dequant_val / elt_max_f);
  float const range_lo(min_dequant_val / elt_max_f);

  cutlass::reference::device::BlockFillRandomUniform(
    block.get(), block.size(), seed, Element(range_hi), Element(range_lo));

  return true;
}
|
||||
|
||||
/// Fills the zero-point tensor. Outside ScaleWithZeroPoint mode every entry is
/// 0 so the same dequantize kernel can be reused unchanged.
template <class Element>
bool initialize_zero(
  cutlass::DeviceAllocation<Element>& block,
  MixedDtypeOptions const& options,
  uint64_t seed = 2023) {

  if (options.mode != MixedDtypeGemmMode::ScaleWithZeroPoint) {
    // No bias, so just initialize with 1 so we can use the same kernel to dequantize the data.
    std::vector<Element> host_zeros(block.size(), Element(0.0f));
    block.copy_from_host(host_zeros.data());
    return true;
  }

  cutlass::reference::device::BlockFillRandomUniform(
    block.get(), block.size(), seed, Element(2.0f), Element(-2.0f));
  return true;
}
|
||||
|
||||
@ -0,0 +1,518 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief An FP8 blockwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
|
||||
|
||||
This example demonstrates a simple way to instantiate and run a blockwise scaling FP8 GEMM on the NVIDIA Blackwell SM120 architecture.
|
||||
This kernel is optimized for the GeForce RTX 50 series GPUs.
|
||||
|
||||
This kernel accepts Inputs A and B with TileMxTileK and TileNxTileK FP32 block scaling, performing scaling and accumulation every TileK elements.
|
||||
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
|
||||
|
||||
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
|
||||
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
|
||||
3. Epilogue Optimization
|
||||
|
||||
Note that GeForce RTX 50 series GPUs do not support:
|
||||
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
|
||||
2. Dynamic datatypes.
|
||||
3. Runtime scaling block size.
|
||||
|
||||
Usage:
|
||||
|
||||
$ ./examples/87_blackwell_geforce_gemm_blockwise/87a_blackwell_geforce_fp8_bf16_gemm_blockwise --m=2048 --n=2048 --k=2048
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
|
||||
#include "cutlass/util/reference/host/gett.hpp"
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
#include "./utils.h"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
using ElementD = ElementC;
|
||||
using LayoutD = LayoutC;
|
||||
constexpr int AlignmentD = AlignmentC;
|
||||
|
||||
// MMA type
|
||||
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
|
||||
using ElementCompute = float;
|
||||
|
||||
|
||||
// MMA and Cluster Tile Shapes
|
||||
// Shape of the tile
|
||||
using MmaTileShape_MNK = Shape<_128,_128,_128>;
|
||||
// Shape of the threadblocks in a cluster
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
|
||||
using ScaleConfig = decltype(cutlass::detail::sm120_trivial_blockwise_scale_config(MmaTileShape_MNK{}));
|
||||
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
|
||||
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutC, AlignmentD,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
|
||||
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
using StrideA = typename Gemm::GemmKernel::StrideA;
|
||||
using StrideB = typename Gemm::GemmKernel::StrideB;
|
||||
using StrideC = typename Gemm::GemmKernel::StrideC;
|
||||
using StrideD = typename Gemm::GemmKernel::StrideD;
|
||||
|
||||
/// Initialization
|
||||
StrideA stride_A;
|
||||
StrideB stride_B;
|
||||
StrideC stride_C;
|
||||
StrideD stride_D;
|
||||
// Strides just iterate over scalars and have no zeros
|
||||
LayoutSFA layout_SFA;
|
||||
LayoutSFB layout_SFB;
|
||||
// Layouts are tiled to the problem size and the strides have zeros
|
||||
uint64_t seed;
|
||||
|
||||
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
|
||||
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
|
||||
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
|
||||
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
|
||||
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
|
||||
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
|
||||
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Testbed utility types
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Command line options parsing
|
||||
// Command-line options parsing for the 87a blockwise-scaled GEMM example.
struct Options {

  bool help = false;               // print usage and exit
  bool skip_verification = false;  // skip the host reference comparison

  float alpha = 1.f, beta = 0.f;   // epilogue scalars
  int iterations = 1000;           // profiling iterations
  int m = 1024, n = 512, k = 1024, l = 1;  // GEMM extents; l is the batch count

  /// Parses the command line.
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    // Flag is optional; the default (false) is kept when absent.
    skip_verification = cmd.check_cmd_line_flag("skip-verification");

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("n", n);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("l", l);
    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
    cmd.get_cmd_line_argument("beta", beta, 0.f);
    cmd.get_cmd_line_argument("iterations", iterations);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "87a_blackwell_geforce_gemm_blockwise\n\n"
        << " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
        << "Options:\n\n"
        << " --help If specified, displays this usage statement\n\n"
        << " --m=<int> Sets the M extent of the GEMM\n"
        << " --n=<int> Sets the N extent of the GEMM\n"
        << " --k=<int> Sets the K extent of the GEMM\n"
        << " --l=<int> Sets the l extent (batch) of the GEMM\n"
        << " --alpha=<f32> Epilogue scalar alpha\n"
        << " --beta=<f32> Epilogue scalar beta\n"
        << " --iterations=<int> Number of profiling iterations to perform.\n\n"
        << " --skip-verification Skip verification.\n\n"
        << "\n\nExamples:\n\n"
        << "$ " << "87a_blackwell_geforce_gemm_blockwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";

    return out;
  }

  /// Compute performance in GFLOP/s.
  double gflops(double runtime_s) const {
    // Two flops per multiply-add; accumulate the flop count in 64-bit to avoid overflow.
    uint64_t flop = uint64_t(2) * m * n * k;
    return (double(flop) / double(1.0e9)) / runtime_s;
  }
};
|
||||
|
||||
/// Result structure
|
||||
/// Result structure: timing, throughput, and status of a single profiled run.
struct Result {
  double avg_runtime_ms;
  double gflops;
  cutlass::Status status;
  cudaError_t error;
  bool passed;

  /// Constructs a result; defaults describe a successful but not-yet-verified run.
  Result(
    double avg_runtime_ms = 0,
    double gflops = 0,
    cutlass::Status status = cutlass::Status::kSuccess,
    cudaError_t error = cudaSuccess) {
    this->avg_runtime_ms = avg_runtime_ms;
    this->gflops = gflops;
    this->status = status;
    this->error = error;
    // Verification has not run yet.
    this->passed = false;
  }
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Initialize operands to be used in the GEMM and reference GEMM
|
||||
void initialize(const Options &options) {
  using namespace cute;

  // NOTE(fix): the original declared `auto gemm_problem_shape = make_shape(m, n, k)`
  // but never used it; the dead local is removed here.

  // Packed (compact) strides for the data tensors, batched over l.
  stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
  stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
  stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
  stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));

  // Scale-factor layouts are tiled to the full problem shape (their strides contain zeros).
  layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
  layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));

  // Host-tensor extents; batch is folded into the leading extent.
  auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
  auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
  auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
  // filter_zeros drops broadcast modes so only the stored scale factors are counted.
  auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
  auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));

  tensor_A.resize(a_coord);
  tensor_B.resize(b_coord);
  tensor_C.resize(c_coord);
  tensor_D.resize(c_coord);
  tensor_ref_D.resize(c_coord);
  tensor_SFA.resize(blockscale_a_coord);
  tensor_SFB.resize(blockscale_b_coord);

  // Distinct seeds so operands are mutually uncorrelated.
  initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
  initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
  initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);

  initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
  initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);

  // Copy host data to the device before launching the kernel.
  tensor_A.sync_device();
  tensor_B.sync_device();
  tensor_C.sync_device();
  tensor_D.sync_device();

  tensor_SFA.sync_device();
  tensor_SFB.sync_device();

}
|
||||
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options.
/// NOTE(review): the aggregate initializer order must match the kernel's
/// Arguments layout (mode, problem shape, mainloop, epilogue) — confirm against
/// the GemmUniversal kernel definition before reordering.
typename Gemm::Arguments args_from_options(const Options &options) {
  typename Gemm::Arguments arguments{
    cutlass::gemm::GemmUniversalMode::kGemm,
    {options.m, options.n, options.k, options.l},
    // Mainloop arguments: A/B data with strides, plus blockwise scale factors.
    {tensor_A.device_data(), stride_A,
     tensor_B.device_data(), stride_B,
     tensor_SFA.device_data(), layout_SFA,
     tensor_SFB.device_data(), layout_SFB},
    // Epilogue arguments: fusion thread params (filled below), then C and D.
    {
      {}, // epilogue.thread
      tensor_C.device_data(), stride_C,
      tensor_D.device_data(), stride_D
    }
  };

  // Set the epilogue scalars after construction.
  auto &fusion_args = arguments.epilogue.thread;
  fusion_args.alpha = options.alpha;
  fusion_args.beta = options.beta;

  return arguments;
}
|
||||
|
||||
/// Verifies the device result against a host reference GEMM.
/// Returns true when the device output matches the host reference exactly.
bool verify(Options const& options) {
  //
  // Compute reference output
  //

  // Wrap the host-side buffers in CuTe tensors using the same strides/layouts
  // that were handed to the device kernel.
  auto A = cute::make_tensor(tensor_A.host_data(),
      cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
  auto B = cute::make_tensor(tensor_B.host_data(),
      cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
  auto C = cute::make_tensor(tensor_C.host_data(),
      cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
  auto D = cute::make_tensor(tensor_ref_D.host_data(),
      cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
  auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
  auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);

  // NOTE(fix): the original declared `using unused_t = decltype(D);` and never
  // used it; the dead alias is removed here.

  cutlass::reference::host::GettBlockScalingMainloopParams<
    ElementAccumulator,
    decltype(A),
    decltype(SFA),
    decltype(B),
    decltype(SFB)
  > mainloop_params{A, SFA, B, SFB};

  cutlass::reference::host::GettEpilogueParams<
    ElementAccumulator,
    ElementAccumulator,
    ElementAccumulator,
    ElementCompute,
    decltype(C),
    decltype(D)
  > epilogue_params;

  epilogue_params.C = C;
  epilogue_params.D = D;
  epilogue_params.alpha = options.alpha;
  epilogue_params.beta = options.beta;

  // get reference result (written into tensor_ref_D through D)
  cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);

  // compare_reference: bring the device result back, then compare element-wise
  tensor_D.sync_host();
  bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());

  return passed;
}
|
||||
|
||||
/// Execute a given example GEMM computation
|
||||
/// Execute a given example GEMM computation: initialize operands, build and
/// launch the kernel, optionally verify against the host reference, then
/// profile over options.iterations launches and report timing/GFLOPS.
template <class Gemm>
int run(Options &options) {
  initialize(options);


  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm;

  // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
  auto arguments = args_from_options(options);

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);


  // Check if the problem size is supported or not
  CUTLASS_CHECK(gemm.can_implement(arguments));


  // Initialize CUTLASS kernel with arguments and workspace pointer
  CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));


  // Correctness / Warmup iteration
  CUTLASS_CHECK(gemm.run());

  Result result;
  if (!options.skip_verification) {
    // Check if output from CUTLASS kernel and reference kernel are equal or not
    result.passed = verify(options);

    std::cout << "  Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;

    if (!result.passed) {
      exit(-1);
    }
  }

  // Run profiling loop. All launches go into the timed region; the warmup
  // launch above is excluded.
  if (options.iterations > 0) {
    GpuTimer timer;
    timer.start();
    for (int iter = 0; iter < options.iterations; ++iter) {
      CUTLASS_CHECK(gemm.run());
    }
    timer.stop();

    // Compute average runtime and GFLOPs.
    float elapsed_ms = timer.elapsed_millis();
    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
    result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);

    std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
    std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
    std::cout << "  GFLOPS: " << result.gflops << std::endl;
  }

  return 0;
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Entry point: checks toolkit/device requirements, parses options, and runs the GEMM.
int main(int argc, char const **args) {

  // CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
  // or CUDA 12.9 or higher for SM121 support.
  // Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
    std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
    std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#endif

  // Query properties of the *current* device.
  // NOTE(fix): the original issued a second, unchecked
  // cudaGetDeviceProperties(&props, 0) call that overwrote `props` with
  // device 0's properties (wrong on multi-GPU systems when the current device
  // is not 0) and left its return code unused; that call is removed.
  cudaDeviceProp props;
  int current_device_id;
  CUDA_CHECK(cudaGetDevice(&current_device_id));
  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
  if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
    std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
    return 0;
  }


  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Run
  //
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)

  return 0;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -0,0 +1,539 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief An FP8 groupwise scaled GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
|
||||
|
||||
This example demonstrates a simple way to instantiate and run cooperative and ping-pong groupwise scaling FP8 GEMMs on the NVIDIA Blackwell SM120 architecture.
|
||||
These kernels are optimized for GeForce RTX 50 series GPUs.
|
||||
|
||||
The blockscaling kernels accept Inputs A and B with 1xTileK and TileNxTileK FP32 block scaling, performing scaling and accumulation every TileK elements.
|
||||
The ping-pong kernel leverages a smaller tile shape to avoid register spilling for better performance.
|
||||
Similar to 79a_blackwell_geforce_nvfp4_bf16_gemm, this kernel leverages:
|
||||
|
||||
1. Warp-Specialized persistent kernel design that supports both cooperative and ping-pong kernel schedule introduced in Hopper.
|
||||
2. The new SW controlled dynamic scheduler based on cluster launch control (See https://docs.nvidia.com/cuda/parallel-thread-execution).
|
||||
3. Epilogue Optimization
|
||||
|
||||
Note that GeForce RTX 50 series GPUs do not support:
|
||||
1. Multicast feature of TMA load. Cluster shape has to be 1x1x1.
|
||||
2. Dynamic datatypes.
|
||||
3. Runtime scaling block size.
|
||||
|
||||
Usage:
|
||||
|
||||
$ ./examples/87_blackwell_geforce_gemm_blockwise/87b_blackwell_geforce_fp8_bf16_gemm_groupwise --m=2048 --n=2048 --k=2048
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/epilogue/thread/activation.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/dispatch_policy.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/tensor_copy.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
|
||||
#include "cutlass/util/reference/host/gett.hpp"
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
#include "./utils.h"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
using ElementD = ElementC;
|
||||
using LayoutD = LayoutC;
|
||||
constexpr int AlignmentD = AlignmentC;
|
||||
|
||||
// MMA type
|
||||
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
|
||||
using ElementCompute = float;
|
||||
|
||||
|
||||
// MMA and Cluster Tile Shapes
|
||||
// Shape of the tile
|
||||
using CooperativeMmaTileShape_MNK = Shape<_128,_128,_128>;
|
||||
// Smaller tile size for pingpong schedule to avoid register spilling
|
||||
using PingpongMmaTileShape_MNK = Shape<_64, _128, _128>;
|
||||
// Shape of the threadblocks in a cluster
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
|
||||
constexpr int ScaleGranularityM = 1;
|
||||
constexpr int ScaleGranularityN = 128;
|
||||
constexpr int ScaleGranularityK = 128;
|
||||
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
|
||||
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
|
||||
|
||||
|
||||
template <class TileShape>
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
TileShape, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC, AlignmentC,
|
||||
ElementD, LayoutC, AlignmentD,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
template <class TileShape, class Schedule>
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, cute::tuple<LayoutA, LayoutSFA>, AlignmentA,
|
||||
ElementB, cute::tuple<LayoutB, LayoutSFB>, AlignmentB,
|
||||
ElementAccumulator,
|
||||
TileShape, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue<TileShape>::SharedStorage))>,
|
||||
Schedule // cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120
|
||||
>::CollectiveOp;
|
||||
|
||||
template <class TileShape, class Schedule>
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
Shape<int,int,int,int>,
|
||||
CollectiveMainloop<TileShape, Schedule>,
|
||||
CollectiveEpilogue<TileShape>,
|
||||
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
|
||||
|
||||
// We are using cooperative kernel schedule by default
|
||||
using CooperativeGemm = cutlass::gemm::device::GemmUniversalAdapter<
|
||||
GemmKernel<CooperativeMmaTileShape_MNK, cutlass::gemm::KernelScheduleSm120Blockwise>>;
|
||||
|
||||
// Pingpong kernel
|
||||
using PingpongGemm = cutlass::gemm::device::GemmUniversalAdapter<
|
||||
GemmKernel<PingpongMmaTileShape_MNK, cutlass::gemm::KernelTmaWarpSpecializedBlockwisePingpongSm120>>;
|
||||
|
||||
using StrideA = typename CooperativeGemm::GemmKernel::StrideA;
|
||||
using StrideB = typename CooperativeGemm::GemmKernel::StrideB;
|
||||
using StrideC = typename CooperativeGemm::GemmKernel::StrideC;
|
||||
using StrideD = typename CooperativeGemm::GemmKernel::StrideD;
|
||||
|
||||
/// Initialization
|
||||
StrideA stride_A;
|
||||
StrideB stride_B;
|
||||
StrideC stride_C;
|
||||
StrideD stride_D;
|
||||
// Strides just iterate over scalars and have no zeros
|
||||
LayoutSFA layout_SFA;
|
||||
LayoutSFB layout_SFB;
|
||||
// Layouts are tiled to the problem size and the strides have zeros
|
||||
uint64_t seed;
|
||||
|
||||
cutlass::HostTensor<ElementA , LayoutA> tensor_A;
|
||||
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFA;
|
||||
cutlass::HostTensor<ElementB , LayoutB> tensor_B;
|
||||
cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout> tensor_SFB;
|
||||
cutlass::HostTensor<ElementC , LayoutC> tensor_C;
|
||||
cutlass::HostTensor<ElementD , LayoutD> tensor_D;
|
||||
cutlass::HostTensor<ElementD , LayoutD> tensor_ref_D;
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Testbed utility types
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Command line options parsing
|
||||
// Command-line options parsing for the 87b groupwise-scaled GEMM example.
struct Options {

  bool help = false;               // print usage and exit
  bool skip_verification = false;  // skip the host reference comparison

  float alpha = 1.f, beta = 0.f;   // epilogue scalars
  int iterations = 1000;           // profiling iterations
  int m = 1024, n = 512, k = 1024, l = 1;  // GEMM extents; l is the batch count

  /// Parses the command line.
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    // Flag is optional; the default (false) is kept when absent.
    skip_verification = cmd.check_cmd_line_flag("skip-verification");

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("n", n);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("l", l);
    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
    cmd.get_cmd_line_argument("beta", beta, 0.f);
    cmd.get_cmd_line_argument("iterations", iterations);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "87b_blackwell_geforce_gemm_groupwise\n\n"
        << " Blackwell FP8 GEMM with Blockwise Scaling using a Warp Specialized kernel.\n\n"
        << "Options:\n\n"
        << " --help If specified, displays this usage statement\n\n"
        << " --m=<int> Sets the M extent of the GEMM\n"
        << " --n=<int> Sets the N extent of the GEMM\n"
        << " --k=<int> Sets the K extent of the GEMM\n"
        << " --l=<int> Sets the l extent (batch) of the GEMM\n"
        << " --alpha=<f32> Epilogue scalar alpha\n"
        << " --beta=<f32> Epilogue scalar beta\n"
        << " --iterations=<int> Number of profiling iterations to perform.\n\n"
        << " --skip-verification Skip verification.\n\n"
        << "\n\nExamples:\n\n"
        << "$ " << "87b_blackwell_geforce_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";

    return out;
  }

  /// Compute performance in GFLOP/s.
  double gflops(double runtime_s) const {
    // Two flops per multiply-add; accumulate the flop count in 64-bit to avoid overflow.
    uint64_t flop = uint64_t(2) * m * n * k;
    return (double(flop) / double(1.0e9)) / runtime_s;
  }
};
|
||||
|
||||
/// Result structure: aggregates timing, throughput, and status for one
/// profiled GEMM run. `passed` always starts false and is set by the
/// caller after verification.
struct Result {
  double avg_runtime_ms;
  double gflops;
  cutlass::Status status;
  cudaError_t error;
  bool passed;

  Result(
      double avg_runtime_ms = 0,
      double gflops = 0,
      cutlass::Status status = cutlass::Status::kSuccess,
      cudaError_t error = cudaSuccess)
      : avg_runtime_ms(avg_runtime_ms)
      , gflops(gflops)
      , status(status)
      , error(error)
      , passed(false) {
  }
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
/// Initialize operands to be used in the GEMM and reference GEMM.
///
/// Builds the packed strides and scale-factor layouts for the requested
/// problem extents, sizes the host tensors, fills A/B/C and the scale
/// factors with uniform random data, and copies everything to the device.
/// Fixed: removed an unused local (`gemm_problem_shape`) that was
/// constructed but never read.
void initialize(const Options &options) {
  using namespace cute;

  stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(options.m, options.k, options.l));
  stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(options.n, options.k, options.l));
  stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(options.m, options.n, options.l));
  stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(options.m, options.n, options.l));

  layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(options.m, options.n, options.k, options.l));
  layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(options.m, options.n, options.k, options.l));

  auto a_coord = cutlass::make_Coord(options.m * options.l, options.k);
  auto c_coord = cutlass::make_Coord(options.m * options.l, options.n);
  auto b_coord = cutlass::make_Coord(options.k, options.n * options.l);
  // Scale-factor tensors hold only the non-broadcast elements of their layouts.
  auto blockscale_a_coord = cutlass::make_Coord(size(filter_zeros(layout_SFA)));
  auto blockscale_b_coord = cutlass::make_Coord(size(filter_zeros(layout_SFB)));

  tensor_A.resize(a_coord);
  tensor_B.resize(b_coord);
  tensor_C.resize(c_coord);
  tensor_D.resize(c_coord);
  tensor_ref_D.resize(c_coord);  // host-side reference output; never copied to device
  tensor_SFA.resize(blockscale_a_coord);
  tensor_SFB.resize(blockscale_b_coord);

  // Distinct seed offsets keep the operands decorrelated.
  initialize_tensor(tensor_A.host_view(), cutlass::Distribution::Uniform, seed + 2022);
  initialize_tensor(tensor_B.host_view(), cutlass::Distribution::Uniform, seed + 2023);
  initialize_tensor(tensor_C.host_view(), cutlass::Distribution::Uniform, seed + 2024);

  initialize_tensor(tensor_SFA.host_view(), cutlass::Distribution::Uniform, seed + 2025);
  initialize_tensor(tensor_SFB.host_view(), cutlass::Distribution::Uniform, seed + 2026);

  tensor_A.sync_device();
  tensor_B.sync_device();
  tensor_C.sync_device();
  tensor_D.sync_device();

  tensor_SFA.sync_device();
  tensor_SFB.sync_device();
}
|
||||
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options.
/// The mainloop takes the device operand pointers with their strides and
/// scale-factor layouts; the epilogue scalars are filled in afterwards.
template <class Gemm>
typename Gemm::Arguments args_from_options(const Options &options) {
  typename Gemm::Arguments arguments{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {options.m, options.n, options.k, options.l},
      {tensor_A.device_data(), stride_A,
       tensor_B.device_data(), stride_B,
       tensor_SFA.device_data(), layout_SFA,
       tensor_SFB.device_data(), layout_SFB},
      {{},  // epilogue.thread — scalars assigned below
       tensor_C.device_data(), stride_C,
       tensor_D.device_data(), stride_D}};

  // Linear-combination epilogue scalars: D = alpha * acc + beta * C.
  arguments.epilogue.thread.alpha = options.alpha;
  arguments.epilogue.thread.beta = options.beta;

  return arguments;
}
|
||||
|
||||
/// Verifies the CUTLASS device result against a host reference GEMM.
/// Returns true when the device output matches the reference exactly.
/// Fixed: removed an unused alias (`using unused_t = decltype(D);`).
bool verify(const Options &options) {
  //
  // Compute reference output
  //

  // Wrap the host-side operand data in CuTe tensors matching the device layouts.
  auto A = cute::make_tensor(tensor_A.host_data(),
    cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
  auto B = cute::make_tensor(tensor_B.host_data(),
    cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
  auto C = cute::make_tensor(tensor_C.host_data(),
    cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
  auto D = cute::make_tensor(tensor_ref_D.host_data(),
    cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D));
  auto SFA = cute::make_tensor(tensor_SFA.host_data(), layout_SFA);
  auto SFB = cute::make_tensor(tensor_SFB.host_data(), layout_SFB);

  cutlass::reference::host::GettBlockScalingMainloopParams<
      ElementAccumulator,
      decltype(A),
      decltype(SFA),
      decltype(B),
      decltype(SFB)
  > mainloop_params{A, SFA, B, SFB};

  cutlass::reference::host::GettEpilogueParams<
      ElementAccumulator,
      ElementAccumulator,
      ElementAccumulator,
      ElementCompute,
      decltype(C),
      decltype(D)
  > epilogue_params;

  epilogue_params.C = C;
  epilogue_params.D = D;
  epilogue_params.alpha = options.alpha;
  epilogue_params.beta = options.beta;

  // get reference result
  cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);

  // compare_reference
  tensor_D.sync_host();
  bool passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());

  return passed;
}
|
||||
|
||||
/// Execute a given example GEMM computation: initialize operands, build and
/// launch the kernel once for warmup/correctness, optionally verify against
/// the host reference, then time `options.iterations` launches.
template <class Gemm>
int run(Options &options) {
  initialize(options);

  // Instantiate CUTLASS kernel depending on templates.
  Gemm gemm;

  // Build kernel arguments and query the workspace it needs.
  auto arguments = args_from_options<Gemm>(options);
  size_t workspace_size = Gemm::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  // Reject unsupported problem configurations before initializing.
  CUTLASS_CHECK(gemm.can_implement(arguments));
  CUTLASS_CHECK(gemm.initialize(arguments, workspace.get()));

  // Correctness / Warmup iteration.
  CUTLASS_CHECK(gemm.run());

  Result result;
  if (!options.skip_verification) {
    // Compare CUTLASS output against the host reference.
    result.passed = verify(options);
    std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;
    if (!result.passed) {
      exit(-1);
    }
  }

  // Profiling loop.
  if (options.iterations > 0) {
    GpuTimer timer;
    timer.start();
    for (int iter = 0; iter < options.iterations; ++iter) {
      CUTLASS_CHECK(gemm.run());
    }
    timer.stop();

    // Average runtime and achieved GFLOP/s.
    float elapsed_ms = timer.elapsed_millis();
    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
    result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);

    std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
    std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
    std::cout << " GFLOPS: " << result.gflops << std::endl;
  }

  return 0;
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {

  // CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
  // or CUDA 12.9 or higher for SM121 support.
  // Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
    std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
    std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#endif

  // Query the properties of the *current* device.
  // Fixed: removed a second, unchecked cudaGetDeviceProperties(&props, 0)
  // call whose result was unused; it always queried device 0 regardless of
  // the current device and overwrote the checked query above.
  cudaDeviceProp props;
  int current_device_id;
  CUDA_CHECK(cudaGetDevice(&current_device_id));
  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
  if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
    std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
    return 0;
  }

  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Run
  //
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  printf("Running kernel with Cooperative kernel schedule:\n");
  run<CooperativeGemm>(options);

  printf("Running kernel with Pingpong kernel schedule:\n");
  run<PingpongGemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)

  return 0;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -0,0 +1,678 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief An FP8 groupwise scaled grouped GEMM example for the NVIDIA Blackwell SM120 architecture using CUTLASS.
|
||||
|
||||
This example demonstrates an implementation of Grouped GEMM using a TMA + Blackwell SM120 TensorOp-based warp-specialized kernel
|
||||
for FP8 with per-group:1x128x128 FP32 scaling factors.
|
||||
In this example, M, N, and K are fixed across groups.
|
||||
As RTX 50 series GPUs do not support runtime scaling block sizes, all groups share the same block scaling size.
|
||||
For this example all scheduling work is performed on the device, utilizing the device-side modification of TMA descriptors
|
||||
to move between groups/problem_count (represented by groups).
|
||||
https://docs.nvidia.com/cuda/cuda-c-programming-guide/#encoding-a-tensor-map-on-device
|
||||
|
||||
To run this example:
|
||||
|
||||
$ ./examples/87_blackwell_geforce_gemm_blockwise/87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise --m=2048 --n=2048 --k=2048 --groups=10
|
||||
|
||||
The above example command makes all 10 groups to be sized at the given m, n, k sizes.
|
||||
Same applies for alpha and beta values that are randomized across the different groups.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
#include <iostream>
#include <limits>

#include "cutlass/cutlass.h"

#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/epilogue/thread/activation.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"

#include "cutlass/util/command_line.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/packed_stride.hpp"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"

#include "cutlass/util/reference/host/gett.hpp"

#include "helper.h"

#include "./utils.h"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
using ProblemShape = cutlass::gemm::GroupProblemShape<Shape<int,int,int>>;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM kernel configurations
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// A matrix configuration
|
||||
using ElementA = cutlass::float_e4m3_t; // Element type for A matrix operand
|
||||
using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand
|
||||
constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// B matrix configuration
|
||||
using ElementB = cutlass::float_e4m3_t; // Element type for B matrix operand
|
||||
using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand
|
||||
constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
// C/D matrix configuration
|
||||
using ElementC = cutlass::bfloat16_t; // Element type for C and D matrix operands
|
||||
using LayoutC = cutlass::layout::RowMajor; // Layout type for C and D matrix operands
|
||||
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value; // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
|
||||
|
||||
using ElementD = ElementC;
|
||||
using LayoutD = LayoutC;
|
||||
constexpr int AlignmentD = AlignmentC;
|
||||
|
||||
// MMA type
|
||||
using ElementAccumulator = float; // Element Accumulator will also be our scale factor type
|
||||
using ElementCompute = float;
|
||||
|
||||
|
||||
// MMA and Cluster Tile Shapes
|
||||
// Shape of the tile
|
||||
using MmaTileShape_MNK = Shape<_128,_128,_128>;
|
||||
// Shape of the threadblocks in a cluster
|
||||
using ClusterShape_MNK = Shape<_1,_1,_1>;
|
||||
|
||||
// Scaling Factors
|
||||
using ElementSF = ElementAccumulator;
|
||||
|
||||
constexpr int ScaleGranularityM = 1;
|
||||
constexpr int ScaleGranularityN = 128;
|
||||
constexpr int ScaleGranularityK = 128;
|
||||
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<ScaleGranularityM, ScaleGranularityN, ScaleGranularityK>;
|
||||
|
||||
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand
|
||||
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand
|
||||
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAccumulator, ElementCompute,
|
||||
ElementC, LayoutC *, AlignmentC,
|
||||
ElementD, LayoutD *, AlignmentD,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm120, cutlass::arch::OpClassTensorOp,
|
||||
ElementA, cute::tuple<LayoutA *, LayoutSFA *>, AlignmentA,
|
||||
ElementB, cute::tuple<LayoutB *, LayoutSFB *>, AlignmentB,
|
||||
ElementAccumulator,
|
||||
MmaTileShape_MNK, ClusterShape_MNK,
|
||||
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::gemm::KernelScheduleSm120Blockwise
|
||||
>::CollectiveOp;
|
||||
|
||||
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue,
|
||||
void>; // Default to ClusterLaunchControl (CLC) based tile scheduler
|
||||
|
||||
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
|
||||
|
||||
using StrideA = typename Gemm::GemmKernel::InternalStrideA;
|
||||
using StrideB = typename Gemm::GemmKernel::InternalStrideB;
|
||||
using StrideC = typename Gemm::GemmKernel::InternalStrideC;
|
||||
using StrideD = typename Gemm::GemmKernel::InternalStrideD;
|
||||
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFA, LayoutSFA>);
|
||||
static_assert(cute::is_same_v<typename Gemm::GemmKernel::CollectiveMainloop::InternalLayoutSFB, LayoutSFB>);
|
||||
|
||||
|
||||
/// Initialization
|
||||
uint64_t seed;
|
||||
|
||||
std::vector<StrideA> stride_A_host;
|
||||
std::vector<StrideB> stride_B_host;
|
||||
std::vector<StrideC> stride_C_host;
|
||||
std::vector<StrideD> stride_D_host;
|
||||
std::vector<LayoutSFA> layout_SFA_host;
|
||||
std::vector<LayoutSFB> layout_SFB_host;
|
||||
|
||||
std::vector<ElementAccumulator> alpha_host;
|
||||
std::vector<ElementAccumulator> beta_host;
|
||||
|
||||
using HostTensorA = cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout>;
|
||||
using HostTensorB = cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout>;
|
||||
using HostTensorC = cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout>;
|
||||
using HostTensorD = cutlass::HostTensor<Gemm::EpilogueOutputOp::ElementOutput, cutlass::layout::PackedVectorLayout>;
|
||||
using HostTensorSFA = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
|
||||
using HostTensorSFB = cutlass::HostTensor<ElementAccumulator, cutlass::layout::PackedVectorLayout>;
|
||||
|
||||
std::vector<HostTensorA> block_A;
|
||||
std::vector<HostTensorB> block_B;
|
||||
std::vector<HostTensorC> block_C;
|
||||
std::vector<HostTensorD> block_D;
|
||||
std::vector<HostTensorD> block_ref_D;
|
||||
std::vector<HostTensorSFA> block_SFA;
|
||||
std::vector<HostTensorSFB> block_SFB;
|
||||
|
||||
cutlass::DeviceAllocation<typename ProblemShape::UnderlyingProblemShape> problem_sizes;
|
||||
cutlass::DeviceAllocation<ElementA const*> ptr_A;
|
||||
cutlass::DeviceAllocation<ElementB const*> ptr_B;
|
||||
cutlass::DeviceAllocation<ElementSF const*> ptr_SFA;
|
||||
cutlass::DeviceAllocation<ElementSF const*> ptr_SFB;
|
||||
cutlass::DeviceAllocation<ElementC const*> ptr_C;
|
||||
cutlass::DeviceAllocation<ElementD *> ptr_D;
|
||||
cutlass::DeviceAllocation<ElementD *> ptr_ref_D;
|
||||
|
||||
cutlass::DeviceAllocation<StrideA> stride_A;
|
||||
cutlass::DeviceAllocation<StrideB> stride_B;
|
||||
cutlass::DeviceAllocation<StrideC> stride_C;
|
||||
cutlass::DeviceAllocation<StrideD> stride_D;
|
||||
cutlass::DeviceAllocation<LayoutSFA> layout_SFA;
|
||||
cutlass::DeviceAllocation<LayoutSFB> layout_SFB;
|
||||
|
||||
cutlass::DeviceAllocation<ElementAccumulator*> alpha_device;
|
||||
cutlass::DeviceAllocation<ElementAccumulator*> beta_device;
|
||||
cutlass::DeviceAllocation<ElementAccumulator> block_alpha;
|
||||
cutlass::DeviceAllocation<ElementAccumulator> block_beta;
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Testbed utility types
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
using RasterOrderOptions = cutlass::gemm::kernel::detail::RasterOrderOptions;
|
||||
// Command line options parsing
|
||||
struct Options {
|
||||
|
||||
bool help = false;
|
||||
bool skip_verification = false;
|
||||
|
||||
float alpha = 1.f, beta = 0.f;
|
||||
int iterations = 1000;
|
||||
int m = 1024, n = 512, k = 1024, l = 1, groups = 10;
|
||||
std::vector<typename ProblemShape::UnderlyingProblemShape> problem_sizes_host;
|
||||
RasterOrderOptions raster_order = RasterOrderOptions::AlongN;
|
||||
|
||||
// Parses the command line
|
||||
void parse(int argc, char const **args) {
|
||||
cutlass::CommandLine cmd(argc, args);
|
||||
|
||||
if (cmd.check_cmd_line_flag("help")) {
|
||||
help = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (cmd.check_cmd_line_flag("skip-verification")) {
|
||||
skip_verification = true;
|
||||
}
|
||||
|
||||
cmd.get_cmd_line_argument("m", m);
|
||||
cmd.get_cmd_line_argument("n", n);
|
||||
cmd.get_cmd_line_argument("k", k);
|
||||
cmd.get_cmd_line_argument("groups", groups);
|
||||
cmd.get_cmd_line_argument("alpha", alpha, 1.f);
|
||||
cmd.get_cmd_line_argument("beta", beta, 0.f);
|
||||
cmd.get_cmd_line_argument("iterations", iterations);
|
||||
|
||||
char raster_char;
|
||||
cmd.get_cmd_line_argument("raster", raster_char, 'N');
|
||||
|
||||
if (raster_char == 'N' || raster_char == 'n') {
|
||||
raster_order = RasterOrderOptions::AlongN;
|
||||
} else if (raster_char == 'M' || raster_char == 'm') {
|
||||
raster_order = RasterOrderOptions::AlongM;
|
||||
}
|
||||
|
||||
for (int i = 0; i < groups; ++i) {
|
||||
problem_sizes_host.push_back({m, n, k});
|
||||
}
|
||||
}
|
||||
|
||||
/// Prints the usage statement.
|
||||
std::ostream & print_usage(std::ostream &out) const {
|
||||
|
||||
out << "87c_blackwell_geforce_grouped_gemm_groupwise\n\n"
|
||||
<< " Blackwell FP8 GEMM with Groupwise Scaling using a Warp Specialized kernel.\n\n"
|
||||
<< "Options:\n\n"
|
||||
<< " --help If specified, displays this usage statement\n\n"
|
||||
<< " --m=<int> Sets the M extent of the GEMM\n"
|
||||
<< " --n=<int> Sets the N extent of the GEMM\n"
|
||||
<< " --k=<int> Sets the K extent of the GEMM\n"
|
||||
<< " --groups=<int> Sets the number of individual GEMM problems for Grouped GEMM\n"
|
||||
<< " --alpha=<f32> Epilogue scalar alpha\n"
|
||||
<< " --beta=<f32> Epilogue scalar beta\n"
|
||||
<< " --iterations=<int> Number of profiling iterations to perform.\n\n"
|
||||
<< " --skip-verification Skip verification.\n\n";
|
||||
|
||||
out
|
||||
<< "\n\nExamples:\n\n"
|
||||
<< "$ " << "87c_blackwell_geforce_grouped_gemm_groupwise" << " --m=1024 --n=512 --k=1024 --groups=8 --alpha=2 --beta=0.707 \n\n";
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/// Compute performance in GFLOP/s
|
||||
double gflops(double runtime_s) const {
|
||||
// Two flops per multiply-add
|
||||
uint64_t flop = uint64_t(2) * m * n * k * groups;
|
||||
double gflop = double(flop) / double(1.0e9);
|
||||
return gflop / runtime_s;
|
||||
}
|
||||
};
|
||||
|
||||
/// Result structure: aggregates timing, throughput, and status for one
/// profiled grouped-GEMM run. `passed` always starts false and is set by
/// the caller after verification.
struct Result {
  double avg_runtime_ms;
  double gflops;
  cutlass::Status status;
  cudaError_t error;
  bool passed;

  Result(
      double avg_runtime_ms = 0,
      double gflops = 0,
      cutlass::Status status = cutlass::Status::kSuccess,
      cudaError_t error = cudaSuccess)
      : avg_runtime_ms(avg_runtime_ms)
      , gflops(gflops)
      , status(status)
      , error(error)
      , passed(false) {
  }
};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Initialize operands to be used in the GEMM and reference GEMM.
///
/// For every group: builds packed strides and scale-factor layouts, allocates
/// host tensors, fills them with uniform random data, copies them to the
/// device, and finally uploads the per-group pointer/stride/layout arrays
/// consumed by the grouped GEMM kernel.
///
/// Fixed: stride_D was built from StrideC{} instead of StrideD{}. The two
/// types happen to coincide for this example's layouts, but using StrideC
/// would silently break if LayoutD ever diverged from LayoutC.
void initialize(const Options &options) {
  using namespace cute;

  std::vector<ElementA *> ptr_A_host(options.groups);
  std::vector<ElementB *> ptr_B_host(options.groups);
  std::vector<ElementSF *> ptr_SFA_host(options.groups);
  std::vector<ElementSF *> ptr_SFB_host(options.groups);
  std::vector<ElementC *> ptr_C_host(options.groups);
  std::vector<ElementD *> ptr_D_host(options.groups);
  std::vector<ElementAccumulator *> ptr_alpha_host(options.groups);
  std::vector<ElementAccumulator *> ptr_beta_host(options.groups);

  block_alpha.reset(options.groups);
  block_beta.reset(options.groups);

  // Per-group host-side strides, layouts, and tensor allocations.
  for (int i = 0; i < options.groups; ++i) {
    auto problem = options.problem_sizes_host.at(i);
    auto [M, N, K] = problem;

    auto stride_A = cutlass::make_cute_packed_stride(StrideA{}, {M, K, 1});
    auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {N, K, 1});
    auto stride_C = cutlass::make_cute_packed_stride(StrideC{}, {M, N, 1});
    auto stride_D = cutlass::make_cute_packed_stride(StrideD{}, {M, N, 1});

    auto layout_A = make_layout(make_shape(M, K, 1), stride_A);
    auto layout_B = make_layout(make_shape(N, K, 1), stride_B);
    auto layout_C = make_layout(make_shape(M, N, 1), stride_C);
    auto layout_D = make_layout(make_shape(M, N, 1), stride_D);

    auto layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1));
    auto layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1));

    stride_A_host.push_back(stride_A);
    stride_B_host.push_back(stride_B);
    layout_SFA_host.push_back(layout_SFA);
    layout_SFB_host.push_back(layout_SFB);
    stride_C_host.push_back(stride_C);
    stride_D_host.push_back(stride_D);

    // Scale-factor tensors hold only the non-broadcast elements of their layouts.
    block_A.push_back(HostTensorA(cutlass::make_Coord(size(layout_A))));
    block_B.push_back(HostTensorB(cutlass::make_Coord(size(layout_B))));
    block_C.push_back(HostTensorC(cutlass::make_Coord(size(layout_C))));
    block_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
    block_SFA.push_back(HostTensorSFA(cutlass::make_Coord(size(filter_zeros(layout_SFA)))));
    block_SFB.push_back(HostTensorSFB(cutlass::make_Coord(size(filter_zeros(layout_SFB)))));
    block_ref_D.push_back(HostTensorD(cutlass::make_Coord(size(layout_D))));
  }

  // Fill, upload, and record per-group device pointers and scalars.
  for (int i = 0; i < options.groups; ++i) {
    initialize_tensor(block_A.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2022);
    initialize_tensor(block_B.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2023);
    initialize_tensor(block_C.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2024);
    initialize_tensor(block_SFA.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2025);
    initialize_tensor(block_SFB.at(i).host_view(), cutlass::Distribution::Uniform, seed + 2026);

    block_A.at(i).sync_device();
    block_B.at(i).sync_device();
    block_C.at(i).sync_device();
    block_SFA.at(i).sync_device();
    block_SFB.at(i).sync_device();

    ptr_A_host.at(i) = block_A.at(i).device_data();
    ptr_B_host.at(i) = block_B.at(i).device_data();
    ptr_C_host.at(i) = block_C.at(i).device_data();
    ptr_D_host.at(i) = block_D.at(i).device_data();
    ptr_SFA_host.at(i) = block_SFA.at(i).device_data();
    ptr_SFB_host.at(i) = block_SFB.at(i).device_data();

    // FLT_MAX sentinel from Options => randomize per group.
    alpha_host.push_back((options.alpha == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>((rand() % 5) + 1) : options.alpha);
    beta_host.push_back((options.beta == std::numeric_limits<float>::max()) ? static_cast<ElementAccumulator>(rand() % 5) : options.beta);
    ptr_alpha_host.at(i) = block_alpha.get() + i;
    ptr_beta_host.at(i) = block_beta.get() + i;
  }

  // Upload the per-group metadata arrays consumed by the grouped kernel.
  problem_sizes.reset(options.groups);
  problem_sizes.copy_from_host(options.problem_sizes_host.data());

  ptr_A.reset(options.groups);
  ptr_A.copy_from_host(ptr_A_host.data());

  ptr_B.reset(options.groups);
  ptr_B.copy_from_host(ptr_B_host.data());

  ptr_SFA.reset(options.groups);
  ptr_SFA.copy_from_host(ptr_SFA_host.data());

  ptr_SFB.reset(options.groups);
  ptr_SFB.copy_from_host(ptr_SFB_host.data());

  ptr_C.reset(options.groups);
  ptr_C.copy_from_host(ptr_C_host.data());

  ptr_D.reset(options.groups);
  ptr_D.copy_from_host(ptr_D_host.data());

  stride_A.reset(options.groups);
  stride_A.copy_from_host(stride_A_host.data());

  stride_B.reset(options.groups);
  stride_B.copy_from_host(stride_B_host.data());

  layout_SFA.reset(options.groups);
  layout_SFA.copy_from_host(layout_SFA_host.data());

  layout_SFB.reset(options.groups);
  layout_SFB.copy_from_host(layout_SFB_host.data());

  stride_C.reset(options.groups);
  stride_C.copy_from_host(stride_C_host.data());

  stride_D.reset(options.groups);
  stride_D.copy_from_host(stride_D_host.data());

  alpha_device.reset(options.groups);
  alpha_device.copy_from_host(ptr_alpha_host.data());
  beta_device.reset(options.groups);
  beta_device.copy_from_host(ptr_beta_host.data());

  block_alpha.copy_from_host(alpha_host.data());
  block_beta.copy_from_host(beta_host.data());
}
|
||||
|
||||
/// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options) {
  // The persistent tile scheduler needs the SM count of the target device.
  cutlass::KernelHardwareInfo hw_info;
  hw_info.device_id = 0;
  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);

  typename Gemm::GemmKernel::TileSchedulerArguments scheduler;
  scheduler.raster_order = options.raster_order;

  typename Gemm::Arguments arguments;
  decltype(arguments.epilogue.thread) fusion_args;
  fusion_args.alpha_ptr = nullptr;
  fusion_args.beta_ptr = nullptr;

  // float max is the sentinel for "not given on the command line". A given
  // scalar applies uniformly to all groups (null pointer array, batch stride
  // 0); otherwise each group reads its own randomized scalar from the
  // per-group device array (batch stride 1).
  bool const uniform_alpha = (options.alpha != std::numeric_limits<float>::max());
  fusion_args.alpha           = uniform_alpha ? options.alpha : 0;
  fusion_args.alpha_ptr_array = uniform_alpha ? nullptr : alpha_device.get();
  fusion_args.dAlpha          = {_0{}, _0{}, uniform_alpha ? 0 : 1};

  bool const uniform_beta = (options.beta != std::numeric_limits<float>::max());
  fusion_args.beta           = uniform_beta ? options.beta : 0;
  fusion_args.beta_ptr_array = uniform_beta ? nullptr : beta_device.get();
  fusion_args.dBeta          = {_0{}, _0{}, uniform_beta ? 0 : 1};

  arguments = {
    cutlass::gemm::GemmUniversalMode::kGrouped,
    {options.groups, problem_sizes.get(), options.problem_sizes_host.data()},
    {ptr_A.get(), stride_A.get(),
     ptr_B.get(), stride_B.get(),
     ptr_SFA.get(), layout_SFA.get(),
     ptr_SFB.get(), layout_SFB.get()},
    {
      fusion_args,
      ptr_C.get(), stride_C.get(),
      ptr_D.get(), stride_D.get()
    },
    hw_info, scheduler
  };

  return arguments;
}
|
||||
|
||||
bool verify(const Options &options) {
  //
  // Compare the device output against a host reference, one group at a time.
  //

  bool passed = true;

  for (int g = 0; g < options.groups; ++g) {
    auto problem = options.problem_sizes_host.at(g);
    auto [M, N, K] = problem;

    // View the host copies of each operand through cute tensors built from
    // the strides/layouts recorded during initialization.
    auto tensor_A = cute::make_tensor(block_A.at(g).host_data(),
                                      cute::make_layout(cute::make_shape(M, K, 1), stride_A_host.at(g)));
    auto tensor_B = cute::make_tensor(block_B.at(g).host_data(),
                                      cute::make_layout(cute::make_shape(N, K, 1), stride_B_host.at(g)));
    auto tensor_C = cute::make_tensor(block_C.at(g).host_data(),
                                      cute::make_layout(cute::make_shape(M, N, 1), stride_C_host.at(g)));
    auto tensor_D = cute::make_tensor(block_ref_D.at(g).host_data(),
                                      cute::make_layout(cute::make_shape(M, N, 1), stride_D_host.at(g)));
    auto tensor_SFA = cute::make_tensor(block_SFA.at(g).host_data(), layout_SFA_host.at(g));
    auto tensor_SFB = cute::make_tensor(block_SFB.at(g).host_data(), layout_SFB_host.at(g));

    cutlass::reference::host::GettBlockScalingMainloopParams<
      ElementAccumulator,
      decltype(tensor_A),
      decltype(tensor_SFA),
      decltype(tensor_B),
      decltype(tensor_SFB)
    > mainloop_params{tensor_A, tensor_SFA, tensor_B, tensor_SFB};

    cutlass::reference::host::GettEpilogueParams<
      ElementAccumulator,
      ElementAccumulator,
      ElementAccumulator,
      ElementCompute,
      decltype(tensor_C),
      decltype(tensor_D)
    > epilogue_params;

    epilogue_params.C = tensor_C;
    epilogue_params.D = tensor_D;
    epilogue_params.alpha = alpha_host.at(g);
    epilogue_params.beta = beta_host.at(g);

    // Run the host reference GEMM for this group...
    cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);

    // ...then pull the device result back and compare elementwise.
    block_D.at(g).sync_host();
    passed &= cutlass::reference::host::TensorEquals(block_ref_D.at(g).host_view(), block_D.at(g).host_view());
  }

  return passed;
}
|
||||
|
||||
/// Execute a given example GEMM computation
template <class Gemm>
int run(Options &options) {

  // Instantiate the CUTLASS kernel wrapper.
  Gemm gemm;

  // Translate command-line options into kernel arguments and allocate any
  // scratch workspace the configuration requires.
  auto args = args_from_options(options);
  cutlass::device_memory::allocation<uint8_t> workspace(Gemm::get_workspace_size(args));

  // Reject unsupported problem configurations up front, then initialize.
  CUTLASS_CHECK(gemm.can_implement(args));
  CUTLASS_CHECK(gemm.initialize(args, workspace.get()));

  // Warmup run; its output is also what the correctness check inspects.
  CUTLASS_CHECK(gemm.run());

  Result result;
  if (!options.skip_verification) {
    // Compare the CUTLASS kernel output against the host reference.
    result.passed = verify(options);

    std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;

    if (!result.passed) {
      exit(-1);
    }
  }

  // Timed profiling loop.
  if (options.iterations > 0) {
    GpuTimer timer;
    timer.start();
    for (int it = 0; it < options.iterations; ++it) {
      CUTLASS_CHECK(gemm.run());
    }
    timer.stop();

    // Average the elapsed time over all iterations and derive throughput.
    double avg_ms = double(timer.elapsed_millis()) / double(options.iterations);
    result.avg_runtime_ms = avg_ms;
    result.gflops = options.gflops(avg_ms / 1000.0);

    std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << " " << options.groups << " Groups" << std::endl;
    std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
    std::cout << " GFLOPS: " << result.gflops << std::endl;
  }

  return 0;
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {

  // CUTLASS must be compiled with CUDA 12.8 or higher Toolkit for SM120 support,
  // or CUDA 12.9 or higher for SM121 support.
  // Must have compute capability at least 120.
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 8)) {
    std::cerr << "This example requires CUDA 12.8 or newer for SM120 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#elif defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
    std::cerr << "This example requires CUDA 12.9 or newer for SM121 support." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }
#endif

  // Query the current device's compute capability once. The original code
  // issued a second, redundant cudaGetDeviceProperties(&props, 0) whose error
  // code was stored but never checked (and which queried device 0 rather than
  // the current device); that call has been removed.
  cudaDeviceProp props;
  int current_device_id;
  CUDA_CHECK(cudaGetDevice(&current_device_id));
  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
  if (!(props.major == 12 && (props.minor == 0 || props.minor == 1))) {
    std::cerr << "This example requires a GPU with compute capability 120a or 121a." << std::endl;
    return 0;
  }

  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Run
  //
#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)
  initialize(options);
  run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM121_SUPPORTED)

  return 0;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
47
examples/87_blackwell_geforce_gemm_blockwise/CMakeLists.txt
Normal file
47
examples/87_blackwell_geforce_gemm_blockwise/CMakeLists.txt
Normal file
@ -0,0 +1,47 @@
|
||||
|
||||
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# Build the Blackwell GeForce blockwise/groupwise GEMM examples only when the
# build targets an SM120a or SM121a architecture.
if (CUTLASS_NVCC_ARCHS MATCHES "120a|121a")

  cutlass_example_add_executable(
    87a_blackwell_geforce_fp8_bf16_gemm_blockwise
    87a_blackwell_geforce_fp8_bf16_gemm_blockwise.cu
  )

  cutlass_example_add_executable(
    87b_blackwell_geforce_fp8_bf16_gemm_groupwise
    87b_blackwell_geforce_fp8_bf16_gemm_groupwise.cu
  )

  cutlass_example_add_executable(
    87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise
    87c_blackwell_geforce_fp8_bf16_grouped_gemm_groupwise.cu
  )

endif()
|
||||
83
examples/87_blackwell_geforce_gemm_blockwise/utils.h
Normal file
83
examples/87_blackwell_geforce_gemm_blockwise/utils.h
Normal file
@ -0,0 +1,83 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/// Fills a host tensor view according to the requested distribution.
///
/// \param view       host tensor view to fill
/// \param dist_kind  which distribution to draw values from
/// \param seed       RNG seed (ignored by AllZeros/Identity/Sequential)
/// \return true on success; throws std::runtime_error for unsupported kinds
template <class Element, class Layout>
bool initialize_tensor(
  cutlass::TensorView<Element, Layout> view,
  cutlass::Distribution::Kind dist_kind,
  uint64_t seed) {

  if (dist_kind == cutlass::Distribution::Uniform) {

    // Narrow the value range as the element type narrows so the sampled
    // values stay representable.
    double scope_max, scope_min;
    int bits_input = cutlass::sizeof_bits<Element>::value;

    if (bits_input == 1) {
      scope_max = 2;
      scope_min = 0;
    }
    else if (bits_input <= 8) {
      scope_max = 2;
      scope_min = -2;
    }
    else if (bits_input == 16) {
      scope_max = 5;
      scope_min = -5;
    }
    else {
      scope_max = 8;
      scope_min = -8;
    }

    cutlass::reference::host::TensorFillRandomUniform(
      view, seed, scope_max, scope_min, 0);
  }
  else if (dist_kind == cutlass::Distribution::AllZeros) {
    cutlass::reference::host::TensorFill(view);
  }
  else if (dist_kind == cutlass::Distribution::Identity) {

    cutlass::reference::host::TensorFillIdentity(view);
  }
  else if (dist_kind == cutlass::Distribution::Gaussian) {

    cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
  }
  else if (dist_kind == cutlass::Distribution::Sequential) {
    cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
  }
  else {
    // Fixed typo in the exception message ("implementated" -> "implemented").
    throw std::runtime_error("Not implemented.");
  }

  return true;
}
|
||||
545
examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu
Normal file
545
examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm.cu
Normal file
@ -0,0 +1,545 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
/*! \file
|
||||
\brief A GEMM example using CUTLASS for the NVIDIA Blackwell SM103 architecture.
|
||||
|
||||
This example demonstrates a simple way to instantiate and run a blockscaled 3xFP4 GEMM on the NVIDIA Blackwell SM103 architecture.
|
||||
|
||||
Usage:
|
||||
|
||||
$ ./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm --m=2048 --n=2048 --k=2048
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/tensor_ref.h"
|
||||
#include "cutlass/epilogue/thread/linear_combination.h"
|
||||
#include "cutlass/gemm/dispatch_policy.hpp"
|
||||
#include "cutlass/gemm/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
#include "cutlass/detail/sm100_blockscaled_layout.hpp"
|
||||
#include "cutlass/gemm/device/gemm_universal_adapter.h"
|
||||
#include "cutlass/gemm/kernel/gemm_universal.hpp"
|
||||
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
#include "cutlass/util/distribution.h"
|
||||
#include "cutlass/util/host_tensor.h"
|
||||
#include "cutlass/util/packed_stride.hpp"
|
||||
#include "cutlass/util/tensor_view_io.h"
|
||||
#include "cutlass/util/reference/device/gemm.h"
|
||||
#include "cutlass/util/reference/device/tensor_compare.h"
|
||||
#include "cutlass/util/reference/host/tensor_fill.h"
|
||||
#include "cutlass/util/reference/host/gett.hpp"
|
||||
#include "cutlass/util/reference/host/tensor_norm.h"
|
||||
#include "cutlass/util/reference/host/tensor_compare.h"
|
||||
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "helper.h"
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)


/////////////////////////////////////////////////////////////////////////////////////////////////
/// GEMM kernel configurations
/////////////////////////////////////////////////////////////////////////////////////////////////

// A matrix configuration
using ElementA = cutlass::float_e2m1_t;                                      // Element type for A matrix operand
using ElementSFA = cutlass::float_ue4m3_t;                                   // Scale-factor element type for A
using LayoutATag = cutlass::layout::RowMajor;                                // Layout type for A matrix operand
constexpr int AlignmentA = 32;                                               // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)

// B matrix configuration
using ElementB = cutlass::float_e2m1_t;                                      // Element type for B matrix operand (comment fixed: was "A")
using ElementSFB = cutlass::float_ue4m3_t;                                   // Scale-factor element type for B
using LayoutBTag = cutlass::layout::ColumnMajor;                             // Layout type for B matrix operand
constexpr int AlignmentB = 32;                                               // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)

// C/D matrix configuration
using ElementD = cutlass::bfloat16_t;                                        // Element type for D matrix operand
using ElementC = cutlass::bfloat16_t;                                        // Element type for C matrix operand
using LayoutCTag = cutlass::layout::RowMajor;                                // Layout type for C matrix operand
using LayoutDTag = cutlass::layout::RowMajor;                                // Layout type for D matrix operand
constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;      // Memory access granularity/alignment of D matrix in units of elements (up to 16 bytes)
constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementC>::value;      // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
// Kernel functional config
using ElementAccumulator = float;                                            // Element type for internal accumulation
using ArchTag = cutlass::arch::Sm103;                                        // Tag indicating the minimum SM that supports the intended feature
using OperatorClass = cutlass::arch::OpClassBlockScaledTensorOp;             // Operator class tag

// using ElementD = cutlass::float_e2m1_t;  // Enable for SF Output          // Element type for D matrix operands

// Kernel Perf config
using MmaTileShape = cute::Shape<cute::_128, cute::_128, Int<768>>;          // MMA's tile size
using ClusterShape = cute::Shape<cute::_2, cute::_4, cute::_1>;              // Shape of the threadblocks in a cluster

// Epilogue fusion operator
using EpilogueFusionOp = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementAccumulator, ElementC, ElementAccumulator>;

using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementAccumulator,
    ElementC, LayoutCTag, AlignmentC,
    ElementD, LayoutDTag, AlignmentD,
    cutlass::epilogue::NoSmemWarpSpecialized1Sm,
    EpilogueFusionOp
  >::CollectiveOp;

using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    ArchTag, OperatorClass,
    cute::tuple<ElementA,ElementSFA>, LayoutATag, AlignmentA,
    cute::tuple<ElementB,ElementSFB>, LayoutBTag, AlignmentB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::KernelTmaWarpSpecialized1SmBlockScaledMxNvf4UltraVs16Sm103 // Kernel schedule policy. Auto or using targeted scheduling policy
  >::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
    Shape<int,int,int,int>, // Indicates ProblemShape
    CollectiveMainloop,
    CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

// Reference device GEMM implementation type
using StrideA = typename Gemm::GemmKernel::StrideA;
using LayoutA = decltype(cute::make_layout(make_shape(0,0,0), StrideA{}));
using LayoutSFA = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFA;  // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
using StrideB = typename Gemm::GemmKernel::StrideB;
using LayoutB = decltype(cute::make_layout(make_shape(0,0,0), StrideB{}));
using LayoutSFB = typename Gemm::GemmKernel::CollectiveMainloop::LayoutSFB;  // Scale Factor tensors have an interleaved layout. Bring Layout instead of stride.
using StrideC = typename Gemm::GemmKernel::StrideC;
using LayoutC = decltype(cute::make_layout(make_shape(0,0,0), StrideC{}));
using StrideD = typename Gemm::GemmKernel::StrideD;
using LayoutD = decltype(cute::make_layout(make_shape(0,0,0), StrideD{}));

//
// Data members
//

/// Initialization
StrideA stride_A;
LayoutA layout_A;
LayoutSFA layout_SFA;
StrideB stride_B;
LayoutB layout_B;
LayoutSFB layout_SFB;
StrideC stride_C;
LayoutC layout_C;
StrideD stride_D;
LayoutD layout_D;
// NOTE(review): `seed` is never assigned in this file (zero-initialized), so
// runs are deterministic — confirm this is intentional.
uint64_t seed;

// The HostTensors are only used for allocating memory on host and device, and transferring data between host and device
// Use cute::Tensor and cute::Layout for iterating thru the matrix elements
cutlass::HostTensor<ElementA, cutlass::layout::PackedVectorLayout> block_A;
cutlass::HostTensor<ElementSFA, cutlass::layout::PackedVectorLayout> block_SFA;
cutlass::HostTensor<ElementB, cutlass::layout::PackedVectorLayout> block_B;
cutlass::HostTensor<ElementSFB, cutlass::layout::PackedVectorLayout> block_SFB;
cutlass::HostTensor<ElementC, cutlass::layout::PackedVectorLayout> block_C;
// Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_D;
// Reference Output Tensor
cutlass::HostTensor<ElementD, cutlass::layout::PackedVectorLayout> block_reference_D;
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)

/// Wraps a raw pointer via cute::recast_ptr so it can be used as a cute
/// tensor iterator.
template <typename T>
auto make_iterator(T* ptr) {
  return cute::recast_ptr<T>(ptr);
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Testbed utility types
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Command line options parsing
struct Options {

  bool help;            // print usage and exit

  float alpha, beta;    // epilogue scalars
  int iterations;       // number of profiling iterations
  int m, n, k;          // GEMM problem extents
  int swizzle = 0;      // cluster rasterization swizzle

  // Fix: the member-initializer list now follows declaration order
  // (help; alpha, beta; iterations; m, n, k; swizzle). Members are always
  // initialized in declaration order, so the previous ordering drew
  // -Wreorder warnings; the initial values are unchanged.
  Options():
    help(false),
    alpha(1.f), beta(0.f),
    iterations(10),
    m(1024), n(1024), k(1024),
    swizzle(0)
  { }

  // Parses the command line
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("n", n);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("alpha", alpha, 1.f);
    cmd.get_cmd_line_argument("beta", beta, 0.f);
    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("swizzle", swizzle);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "89_sm103_fp4_ultra_gemm\n\n"
      << " Sm103 3xFP4 GEMM using a Warp Specialized kernel.\n\n"
      << "Options:\n\n"
      << " --help If specified, displays this usage statement\n\n"
      << " --m=<int> Sets the M extent of the GEMM\n"
      << " --n=<int> Sets the N extent of the GEMM\n"
      << " --k=<int> Sets the K extent of the GEMM\n"
      << " --alpha=<f32> Epilogue scalar alpha\n"
      << " --beta=<f32> Epilogue scalar beta\n"
      << " --swizzle=<int> Cluster rasterization swizzle\n"
      << " --iterations=<int> Number of profiling iterations to perform.\n\n";

    out << "\n\nExamples:\n\n"
      << "$ " << "./examples/89_sm103_fp4_ultra_gemm/89_sm103_fp4_ultra_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";

    return out;
  }

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const
  {
    // Two flops per multiply-add
    uint64_t flop = uint64_t(2) * m * n * k;
    double gflop = double(flop) / double(1.0e9);
    return gflop / runtime_s;
  }
};
|
||||
|
||||
/// Result structure
struct Result
{
  double avg_runtime_ms;   // mean kernel runtime over the profiling loop
  double gflops;           // throughput derived from avg_runtime_ms
  cutlass::Status status;  // last CUTLASS status
  cudaError_t error;       // last CUDA runtime error
  bool passed;             // verification outcome; always starts false and is set by the caller

  /// All-defaulted constructor: zero timings, success status, not yet passed.
  Result(
    double avg_runtime_ms = 0,
    double gflops = 0,
    cutlass::Status status = cutlass::Status::kSuccess,
    cudaError_t error = cudaSuccess)
  :
    avg_runtime_ms(avg_runtime_ms), gflops(gflops), status(status), error(error), passed(false)
  {}

};
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// GEMM setup and evaluation
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Helper to initialize a block of device data
template <typename Element, typename Layout>
bool initialize_block(
  cutlass::TensorView<Element, Layout> view,
  uint64_t seed) {

  // Select a fill range appropriate to the element width; all branches are
  // resolved at compile time since the width is a template constant.
  double hi;
  double lo;
  constexpr int bits_input = cutlass::sizeof_bits<Element>::value;

  if constexpr (bits_input == 1) {
    hi = 2;
    lo = 0;
  }
  else if constexpr (bits_input <= 6) {
    hi = 2;
    lo = -2;
  }
  else if constexpr (bits_input <= 8) {
    // float_ue8m0_t gets a strictly positive range; other 7-8 bit types
    // are filled symmetrically around zero.
    if constexpr (cute::is_same_v<Element, cutlass::float_ue8m0_t>) {
      hi = 4;
      lo = 1;
    }
    else {
      hi = 1;
      lo = -1;
    }
  }
  else {
    hi = 4;
    lo = -4;
  }

  cutlass::reference::host::TensorFillRandomUniform(view, seed, hi, lo, 0);

  return true;
}
|
||||
|
||||
/// Initialize operands to be used in the GEMM and reference GEMM
void initialize(const Options &options) {
  using namespace cute;
  // For SFA and SFB tensors layouts: the block-scaled kernel config dictates
  // the (interleaved) scale-factor layouts.
  using Sm1xxBlkScaledConfig = typename Gemm::GemmKernel::CollectiveMainloop::Sm1xxBlkScaledConfig;

  // Packed strides for the data tensors (A: MxK, B: NxK, C/D: MxN, batch 1).
  stride_A = cutlass::make_cute_packed_stride(StrideA{}, {options.m, options.k, 1});
  stride_B = cutlass::make_cute_packed_stride(StrideB{}, {options.n, options.k, 1});
  stride_C = cutlass::make_cute_packed_stride(StrideC{}, {options.m, options.n, 1});
  stride_D = cutlass::make_cute_packed_stride(StrideD{}, {options.m, options.n, 1});

  // Layouts used both for sizing the allocations below and by the host
  // reference in verify().
  layout_A = make_layout(make_shape(options.m, options.k, 1), stride_A);
  layout_B = make_layout(make_shape(options.n, options.k, 1), stride_B);
  layout_C = make_layout(make_shape(options.m, options.n, 1), stride_C);
  layout_D = make_layout(make_shape(options.m, options.n, 1), stride_D);
  layout_SFA = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFA(cute::make_shape(options.m, options.n, options.k, 1));
  layout_SFB = Sm1xxBlkScaledConfig::tile_atom_to_shape_SFB(cute::make_shape(options.m, options.n, options.k, 1));

  // Allocate host+device storage sized by each layout; scale-factor tensors
  // are sized by the non-zero-stride portion of their layouts.
  block_A.reset(cutlass::make_Coord(size(layout_A)));
  block_B.reset(cutlass::make_Coord(size(layout_B)));
  block_C.reset(cutlass::make_Coord(size(layout_C)));
  block_D.reset(cutlass::make_Coord(size(layout_D)));
  block_reference_D.reset(cutlass::make_Coord(size(layout_D)));
  block_SFA.reset(cutlass::make_Coord(size(filter_zeros(layout_SFA))));
  block_SFB.reset(cutlass::make_Coord(size(filter_zeros(layout_SFB))));

  // Fill host buffers with distinct pseudo-random streams...
  initialize_block(block_A.host_view(), seed + 2021);
  initialize_block(block_B.host_view(), seed + 2022);
  initialize_block(block_C.host_view(), seed + 2023);
  initialize_block(block_SFA.host_view(), seed + 2024);
  initialize_block(block_SFB.host_view(), seed + 2025);

  // ...then push all operands to the device.
  block_A.sync_device();
  block_B.sync_device();
  block_C.sync_device();
  block_SFA.sync_device();
  block_SFB.sync_device();
}
|
||||
|
||||
// Populates a Gemm::Arguments structure from the given commandline options
typename Gemm::Arguments args_from_options(const Options &options)
{
  // Aggregate-initialize: universal GEMM mode, problem shape (batch = 1),
  // mainloop operands (A/B data plus their scale-factor tensors), and the
  // epilogue (alpha/beta scalars plus C and D).
  typename Gemm::Arguments arguments {
    cutlass::gemm::GemmUniversalMode::kGemm,
    {options.m, options.n, options.k, 1},
    { // Mainloop arguments
      block_A.device_data(), stride_A,
      block_B.device_data(), stride_B,
      block_SFA.device_data(), layout_SFA,
      block_SFB.device_data(), layout_SFB
    },
    { // Epilogue arguments
      {options.alpha, options.beta},
      block_C.device_data(), stride_C,
      block_D.device_data(), stride_D
    }
  };

  // Optional cluster-rasterization swizzle from the command line.
  arguments.scheduler.max_swizzle_size = options.swizzle;
  return arguments;
}
|
||||
|
||||
// Runs the host block-scaled GETT reference and compares its output against
// the device result bit-exactly. Also rejects all-zero outputs (norm must be
// positive for both reference and device D) so a trivially-zeroed buffer
// cannot masquerade as a pass.
bool verify(const Options &options) {
  using namespace cute;

  // Wrap host buffers in CuTe tensors for the reference implementation.
  auto tA   = make_tensor(make_iterator(block_A.host_data()), layout_A);
  auto tSFA = make_tensor(block_SFA.host_data(), layout_SFA);
  auto tB   = make_tensor(make_iterator(block_B.host_data()), layout_B);
  auto tSFB = make_tensor(block_SFB.host_data(), layout_SFB);

  cutlass::reference::host::GettBlockScalingMainloopParams<
    ElementAccumulator,   // ElementAccumulator
    decltype(tA),         // TensorA
    decltype(tSFA),       // TensorSfA
    decltype(tB),         // TensorB
    decltype(tSFB)        // TensorSfB
  > mainloop{tA, tSFA, tB, tSFB};

  auto tC = make_tensor(make_iterator(block_C.host_data()), layout_C);
  auto tD = make_tensor(make_iterator(block_reference_D.host_data()), layout_D);

  cutlass::reference::host::GettBlockScalingEpilogueParams<
    ElementAccumulator,   // ElementScalar
    ElementAccumulator,   // ElementAccumulator
    ElementAccumulator,   // ElementCompute
    decltype(tC),         // TensorC
    decltype(tD)          // TensorD
  > epilogue{options.alpha, options.beta, tC, tD};

  // Reference GEMM writes into block_reference_D (host side).
  cutlass::reference::host::Gemm3x(mainloop, epilogue);

  // Pull the device result back before comparing.
  block_D.sync_host();

  bool equal       = cutlass::reference::host::TensorEquals(block_reference_D.host_view(), block_D.host_view());
  bool ref_nonzero = cutlass::reference::host::TensorNorm(block_reference_D.host_view()) > 0;
  bool dev_nonzero = cutlass::reference::host::TensorNorm(block_D.host_view()) > 0;

  return equal && ref_nonzero && dev_nonzero;
}
|
||||
|
||||
/// Execute a given example GEMM computation: initialize operands, run one
/// correctness/warmup iteration, verify against the host reference, then
/// (optionally) profile. Returns 0 on success, -1 on allocation failure;
/// exits the process on verification failure.
template <typename Gemm>
int run(Options &options)
{
  initialize(options);

  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm;

  // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
  auto arguments = args_from_options(options);

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  uint8_t* workspace = nullptr;
  cudaError_t status = cudaMalloc(&workspace, workspace_size);
  if (status != cudaSuccess) {
    std::cerr << "Failed to allocate workspace memory: " << cudaGetErrorString(status) << std::endl;
    return -1;
  }

  // Check if the problem size is supported or not
  CUTLASS_CHECK(gemm.can_implement(arguments));

  // Initialize CUTLASS kernel with arguments and workspace pointer
  CUTLASS_CHECK(gemm.initialize(arguments, workspace));

  // Correctness / Warmup iteration
  CUTLASS_CHECK(gemm.run());

  // Ensure the warmup kernel has completed before host-side verification.
  cudaDeviceSynchronize();

  // Check if output from CUTLASS kernel and reference kernel are equal or not
  Result result;
  result.passed = verify(options);

  std::cout << " Disposition: " << (result.passed ? "Passed" : "Failed") << std::endl;

  if (!result.passed) {
    cudaFree(workspace);
    exit(-1);
  }

  // Run profiling loop
  if (options.iterations > 0)
  {
    GpuTimer timer;
    timer.start();
    for (int iter = 0; iter < options.iterations; ++iter) {
      // Re-initialization consumes the workspace, so it must stay allocated
      // for the full duration of the profiling loop.
      CUTLASS_CHECK(gemm.initialize(arguments, workspace));
      CUTLASS_CHECK(gemm.run());
    }
    timer.stop();

    // Compute average runtime and GFLOPs.
    float elapsed_ms = timer.elapsed_millis();
    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
    result.gflops = options.gflops(result.avg_runtime_ms / 1000.0);

    std::cout << " Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
    std::cout << " Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
    std::cout << " GFLOPS: " << result.gflops << std::endl;
  }

  // BUG FIX: the original freed the workspace right after the warmup run and
  // then reused the dangling pointer in the profiling loop's
  // gemm.initialize(arguments, workspace) — a use-after-free whenever
  // workspace_size > 0. Free it only once every run has finished.
  cudaFree(workspace);

  return 0;
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int main(int argc, char const **args) {

  // CUTLASS must be compiled with CUDA 12.9 or higher Toolkit to run this example
  // and must have compute capability at least 100.
  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 9)) {
    std::cerr << "This example requires CUDA 12.9 or newer." << std::endl;
    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
    return 0;
  }

  cudaDeviceProp props;
  int current_device_id;
  // BUG FIX: the address-of operator had been garbled into a stray "¤"
  // character (a mis-encoded "&curren;" HTML entity), which does not compile.
  // cudaGetDevice takes the address of the output device id.
  CUDA_CHECK(cudaGetDevice(&current_device_id));

  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));

  // This example only targets SM103 (Blackwell) devices.
  if (!(props.major == 10 && props.minor == 3)) {
    std::cerr << "This example requires a GPU of NVIDIA's Blackwell architecture (compute capability 103)." << std::endl;
    return 0;
  }

  //
  // Parse options
  //

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  //
  // Evaluate CUTLASS kernels
  //
#if defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)
  run<Gemm>(options);
#endif // defined(CUTLASS_ARCH_MMA_SM103_SUPPORTED)

  return 0;
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
38
examples/89_sm103_fp4_ultra_gemm/CMakeLists.txt
Normal file
38
examples/89_sm103_fp4_ultra_gemm/CMakeLists.txt
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
# Register this example only when building for the Blackwell SM103a target.
if (CUTLASS_NVCC_ARCHS MATCHES 103a)

  cutlass_example_add_executable(
    89_sm103_fp4_ultra_gemm
    89_sm103_fp4_ultra_gemm.cu
    )

endif()
|
||||
File diff suppressed because it is too large
Load Diff
42
examples/90_sm103_fp4_ultra_grouped_gemm/CMakeLists.txt
Normal file
42
examples/90_sm103_fp4_ultra_grouped_gemm/CMakeLists.txt
Normal file
@ -0,0 +1,42 @@
|
||||
# Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
# Smoke-test option sets: random problem sizes across a small number of groups.
set(TEST_RANDOM_SMALL_GROUP --groups=3 --iterations=1) # Random problem sizes
set(TEST_EPILOGUE_SMALL_GROUP --alpha=1.5 --beta=2.0 --groups=3 --iterations=1) # Random problem sizes

# Register this example only when building for the Blackwell SM103a target.
if (CUTLASS_NVCC_ARCHS MATCHES 103a)

  cutlass_example_add_executable(
    90_sm103_fp4_ultra_grouped_gemm
    90_sm103_fp4_ultra_grouped_gemm.cu
    TEST_COMMAND_OPTIONS
    TEST_RANDOM_SMALL_GROUP
    TEST_EPILOGUE_SMALL_GROUP
    )

endif()
|
||||
898
examples/91_fp4_gemv/91_fp4_gemv.cu
Normal file
898
examples/91_fp4_gemv/91_fp4_gemv.cu
Normal file
@ -0,0 +1,898 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
#include <cstdint> // uint64_t
|
||||
#include <cstdio>
|
||||
#include <cstdlib> // rand(), RAND_MAX
|
||||
#include <string> // std::stoi
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <float.h>
|
||||
#include <optional>
|
||||
|
||||
#include "cutlass/util/command_line.h"
|
||||
// clang-format off
|
||||
#include "cute/tensor.hpp" // FIX cute header file inclusion issue
|
||||
// clang-format on
|
||||
|
||||
#include "cute/arch/mma_sm100_desc.hpp" // cute::UMMA::Major
|
||||
#include "cute/numeric/numeric_types.hpp" // cute::sizeof_bits_v
|
||||
#include "cutlass/complex.h" // cutlass::ComplexTransform
|
||||
#include "cutlass/cutlass.h" // cutlass::Status
|
||||
#include "cutlass/detail/sm100_blockscaled_layout.hpp" // cutlass::detail::Sm1xxBlockScaledOutputConfig
|
||||
#include "cutlass/epilogue/thread/linear_combination.h" // cutlass::epilogue::thread::LinearCombination
|
||||
#include "cutlass/gemm/device/gemv_blockscaled.h" // cutlass::gemm::device::Gemv
|
||||
#include "cutlass/gemm/kernel/gemv_blockscaled.h" // cutlass::gemm::kernel::Gemv
|
||||
#include "cutlass/epilogue/threadblock/epilogue_with_scaling_factor.h" // cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor
|
||||
#include "cutlass/gemm_coord.h" // cutlass::GemmCoord
|
||||
#include "cutlass/layout/matrix.h" // cutlass::layout::Affine2Layout_Factory
|
||||
#include "cutlass/numeric_size.h" // cutlss::is_subbyte
|
||||
#include "cutlass/numeric_types.h"
|
||||
#include "cutlass/platform/platform.h" // cutlass::is_same_v
|
||||
#include "cutlass/util/device_memory.h" // cutlass::device_memory::allocation
|
||||
#include "cutlass/util/distribution.h" // cutlass::Distribution
|
||||
#include "cutlass/util/host_tensor.h" // cutlass::HostTensor
|
||||
#include "cutlass/util/packed_stride.hpp" // cutlass::make_cute_packed_stride
|
||||
#include "cutlass/util/reference/host/gemm_complex.h" // cutlass::reference::host::GemmComplex
|
||||
#include <cutlass/util/reference/host/gett.hpp> // cutlass::reference::host::GettBlockScalingMainloopParams
|
||||
// cutlass::reference::host::GettBlockScalingEpilogueParams
|
||||
// cutlass::reference::host::Gemm3x
|
||||
#include "cutlass/util/reference/host/tensor_compare.h" // cutlass::reference::host::TensorEquals
|
||||
#include "cutlass/util/reference/host/tensor_fill.h" // cutlass::reference::host::TensorFillRandomUniform
|
||||
#include "cutlass/numeric_size.h" // cutlass::bits_to_bytes
|
||||
|
||||
// Helper Functions
|
||||
// Wraps a raw pointer via cute::recast_ptr so downstream CuTe tensors iterate
// elements correctly (notably for sub-byte element types).
template <typename T>
auto make_iterator(T* ptr) {
  return cute::recast_ptr<T>(ptr);
}
|
||||
|
||||
// Fills `view` according to `dist_kind`, choosing uniform ranges appropriate
// for the element's bit width (narrow/FP4-ish types get tighter ranges;
// unsigned-exponent FP8 scale types get a strictly positive range).
// Returns false only for an unrecognized distribution kind.
template <typename Element, typename Layout>
bool
initialize_tensor(cutlass::TensorView<Element, Layout> view, cutlass::Distribution::Kind dist_kind, uint64_t seed)
{
  switch (dist_kind) {

    case cutlass::Distribution::Uniform: {
      int const bits_input = cutlass::sizeof_bits<Element>::value;
      double scope_max, scope_min;

      if (bits_input == 1) {
        // Binary elements: {0, 1}.
        scope_max = 2;
        scope_min = 0;
      }
      else if (bits_input <= 6) {
        scope_max = 2;
        scope_min = -2;
      }
      else if (bits_input <= 8) {
        if constexpr (cutlass::is_same_v<Element, cutlass::float_ue4m3_t> ||
                      cutlass::is_same_v<Element, cutlass::float_ue8m0_t>) {
          // Unsigned-exponent scale-factor types must stay positive.
          scope_max = 4;
          scope_min = 1;
        }
        else {
          scope_max = 1;
          scope_min = -1;
        }
      }
      else {
        scope_max = 4;
        scope_min = -4;
      }

      cutlass::reference::host::TensorFillRandomUniform(view, seed, scope_max, scope_min, 0);
      break;
    }

    case cutlass::Distribution::Identity:
      cutlass::reference::host::TensorFillIdentity(view);
      break;

    case cutlass::Distribution::Gaussian:
      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
      break;

    case cutlass::Distribution::Sequential:
      cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
      break;

    case cutlass::Distribution::AllOnes:
      cutlass::reference::host::TensorFill(view, Element(1));
      break;

    case cutlass::Distribution::AllZeros:
      cutlass::reference::host::TensorFill(view, Element(0));
      break;

    default:
      CUTLASS_ASSERT(false);
      return false;
  }

  return true;
}
|
||||
|
||||
// Base class of Testbed
|
||||
template <
|
||||
typename Gemv_,
|
||||
// The following types are more difficult to be derived from EVT
|
||||
typename ElementC, typename LayoutC, typename ElementD_,
|
||||
typename LayoutD, typename ElementSFD_, typename LayoutSFD,
|
||||
typename ElementCompute_, int kVectorSize_>
|
||||
struct TestbedGemvFp4SFDBase
|
||||
{
|
||||
public:
|
||||
using Gemv = Gemv_;
|
||||
|
||||
using ElementA = typename Gemv::ElementA;
|
||||
using ElementSFA = typename Gemv::ElementSFA;
|
||||
using LayoutA = typename Gemv::LayoutA;
|
||||
static_assert(cutlass::is_same_v<LayoutA, cutlass::layout::RowMajor>, "only support row major matrix A");
|
||||
static_assert(cutlass::sizeof_bits<ElementSFA>::value == 8, "ElementSFA should be FP8 type");
|
||||
|
||||
using ElementB = typename Gemv::ElementB;
|
||||
using ElementSFB = typename Gemv::ElementSFB;
|
||||
using LayoutB = cutlass::layout::ColumnMajor;
|
||||
static_assert(cutlass::is_same_v<ElementA, ElementB>, "only support ElementA ElementB of same type");
|
||||
static_assert(cutlass::sizeof_bits<ElementSFB>::value == 8, "ElementSFB should be FP8 type");
|
||||
|
||||
static_assert(cutlass::is_same_v<LayoutC, cutlass::layout::ColumnMajor>, "only support col major output D");
|
||||
|
||||
using ElementD = ElementD_;
|
||||
static_assert(cutlass::is_same_v<LayoutD, cutlass::layout::ColumnMajor>, "only support col major output D");
|
||||
|
||||
using ElementSFD = ElementSFD_;
|
||||
static_assert(cutlass::is_same_v<LayoutSFD, cutlass::layout::ColumnMajor>, "only support col major output SFD");
|
||||
static_assert(cutlass::sizeof_bits<ElementSFD>::value, "only support 8 bit SFD");
|
||||
|
||||
using ElementAccumulator = typename Gemv::ElementAccumulator;
|
||||
using ElementCompute = ElementCompute_;
|
||||
static_assert(cutlass::is_same_v<ElementCompute, float>, "only support fp32 epi compute");
|
||||
|
||||
static constexpr int kVectorSize = kVectorSize_;
|
||||
static_assert(kVectorSize == 16, "only support vs 16");
|
||||
|
||||
// SFD Config
|
||||
static constexpr bool kIsKMajorSFD = cutlass::is_same_v<LayoutSFD, cutlass::layout::RowMajor>;
|
||||
using Sm1xxBlockScaledOutputConfig=
|
||||
cutlass::detail::Sm1xxBlockScaledOutputConfig<kVectorSize,
|
||||
kIsKMajorSFD ? cute::UMMA::Major::K : cute::UMMA::Major::MN>;
|
||||
using Blk_MN_Output = typename Sm1xxBlockScaledOutputConfig::Blk_MN;
|
||||
using Blk_SF_Output = typename Sm1xxBlockScaledOutputConfig::Blk_SF;
|
||||
using OutputSFAtom = typename Sm1xxBlockScaledOutputConfig::SfAtom;
|
||||
|
||||
// SFA SFB Config
|
||||
using Sm100BlockScaledInputConfig = cutlass::detail::Sm1xxBlockScaledConfig<kVectorSize>;
|
||||
using Blk_MN_Input = typename Sm100BlockScaledInputConfig::Blk_MN;
|
||||
using Blk_SF_Input = typename Sm100BlockScaledInputConfig::Blk_SF;
|
||||
using SfAtom_Input = typename Sm100BlockScaledInputConfig::SfAtom;
|
||||
|
||||
public:
|
||||
// Records the initialization distribution for each operand plus the RNG seed;
// all defaults are Uniform. No allocation happens here — see initialize().
TestbedGemvFp4SFDBase(cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_D_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_SFA_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_SFB_ = cutlass::Distribution::Uniform,
                      cutlass::Distribution::Kind init_SFD_ = cutlass::Distribution::Uniform,
                      uint64_t seed_ = 2023)
    : init_A(init_A_), init_B(init_B_), init_C(init_C_), init_D(init_D_),
      init_SFA(init_SFA_), init_SFB(init_SFB_), init_SFD(init_SFD_),
      seed(seed_) {}
|
||||
|
||||
// Sizes and randomly fills every host/device tensor for a batched
// (m x k) * (k x 1) GEMV problem, then copies operands to the device.
// Returns false if any fill fails.
bool initialize(cutlass::MatrixCoord problem_size, int32_t batch_count)
{
  int32_t const gemm_m = problem_size.row();
  int32_t const gemm_k = problem_size.column();
  int32_t const gemm_n = 1;
  int32_t const gemm_batch = batch_count;

  // ---- SFA / SFB extents, rounded up to full scale-factor blocks ----
  auto k_blks_input = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
  auto m_blks_input = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
  auto n_blks_input = cutlass::ceil_div(gemm_n, Blk_MN_Input{});

  auto sfa_coord = cutlass::make_Coord(m_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});
  auto sfb_coord = cutlass::make_Coord(n_blks_input * Blk_MN_Input{} * gemm_batch, k_blks_input * Blk_SF_Input{});

  auto sfa_resize_layout =
      cutlass::layout::Affine2Layout_Factory<LayoutA>::layout_factory(sfa_coord, typename LayoutA::Stride{});
  auto sfb_resize_layout =
      cutlass::layout::Affine2Layout_Factory<LayoutB>::layout_factory(sfb_coord, typename LayoutB::Stride{});

  // ---- SFD: use the exact layout the reference implementation generates ----
  using ProblemShapeType = cute::Shape<int, int, int, int>;
  auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};

  auto sfd_layout = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);

  // Flatten the generated layout into a linear coordinate for HostTensor.
  auto sfd_size = cute::size(cute::filter_zeros(sfd_layout));
  auto sfd_coord = cutlass::make_Coord(sfd_size, 1);

  auto sfd_resize_layout =
      cutlass::layout::Affine2Layout_Factory<LayoutSFD>::layout_factory(sfd_coord, typename LayoutSFD::Stride{});

  // ---- Reference-side buffers ----
  this->reference_D.resize({gemm_batch * gemm_m, 1}); // D col major vector
  this->reference_SFD.resize(sfd_coord, sfd_resize_layout);

  if (!initialize_tensor(this->reference_D.host_view(), this->init_D, this->seed + 7)) {
    printf("initialize_tensor() REF D failed\n");
    return false;
  }
  if (!initialize_tensor(this->reference_SFD.host_view(), this->init_SFD, this->seed + 9)) {
    printf("initialize_tensor() REF SFD failed\n");
    return false;
  }

  // ---- Device-side buffers (A row major; B/C/D column-major vectors) ----
  this->tensor_A.resize({gemm_batch * gemm_m, gemm_k});
  this->tensor_B.resize({gemm_batch * gemm_k, 1});
  this->tensor_C.resize({gemm_batch * gemm_m, 1});
  this->tensor_D.resize({gemm_batch * gemm_m, 1});
  this->tensor_SFA.resize(sfa_coord, sfa_resize_layout);
  this->tensor_SFB.resize(sfb_coord, sfb_resize_layout);
  this->tensor_SFD.resize(sfd_coord, sfd_resize_layout);

  // Fill A/B/C
  if (!initialize_tensor(this->tensor_A.host_view(), this->init_A, this->seed + 1)) {
    printf("initialize_tensor() A failed\n");
    return false;
  }
  if (!initialize_tensor(this->tensor_B.host_view(), this->init_B, this->seed + 2)) {
    printf("initialize_tensor() B failed\n");
    return false;
  }
  if (!initialize_tensor(this->tensor_C.host_view(), this->init_C, this->seed + 3)) {
    printf("initialize_tensor() C failed\n");
    return false;
  }

  // Fill SFA/SFB
  if (!initialize_tensor(this->tensor_SFA.host_view(), this->init_SFA, this->seed + 4)) {
    printf("initialize_tensor() SFA failed\n");
    return false;
  }
  if (!initialize_tensor(this->tensor_SFB.host_view(), this->init_SFB, this->seed + 5)) {
    printf("initialize_tensor() SFB failed\n");
    return false;
  }

  // Fill D/SFD (pre-filling outputs guards against false-positive comparisons)
  if (!initialize_tensor(this->tensor_D.host_view(), this->init_D, this->seed + 6)) {
    printf("initialize_tensor() D failed\n");
    return false;
  }
  if (!initialize_tensor(this->tensor_SFD.host_view(), this->init_SFD, this->seed + 8)) {
    printf("initialize_tensor() SFD failed\n");
    return false;
  }

  // Push everything to the device.
  this->tensor_A.sync_device();
  this->tensor_B.sync_device();
  this->tensor_C.sync_device();
  this->tensor_D.sync_device();
  this->tensor_SFA.sync_device();
  this->tensor_SFB.sync_device();
  this->tensor_SFD.sync_device();

  // SFD is handled specially: copy tensor_SFD's device contents back into
  // reference_SFD so both start from identical bytes. Otherwise the layout
  // "bubbles" (unwritten gaps in the SFD layout) would differ between the two
  // buffers and cause false-negative comparisons.
  cutlass::device_memory::copy_to_host(this->reference_SFD.host_data(), this->tensor_SFD.device_data(), sfd_size);

  return true;
}
|
||||
|
||||
// Syncs D and SFD back to the host and compares each bit-exactly against the
// reference buffers. Prints the problem extent on mismatch.
bool compare_reference()
{
  // device -> host
  this->tensor_D.sync_host();

  if (!cutlass::reference::host::TensorEquals(this->reference_D.host_view(), this->tensor_D.host_view())) {
    printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
    printf("tensorD mismatch\n");
    return false;
  }

  this->tensor_SFD.sync_host();

  if (!cutlass::reference::host::TensorEquals(this->reference_SFD.host_view(), this->tensor_SFD.host_view())) {
    printf("gemm_m: %d, gemm_k: %d, ", this->tensor_A.host_view().extent(0), this->tensor_A.host_view().extent(1));
    printf("tensorSFD mismatch\n");
    return false;
  }

  return true;
}
|
||||
|
||||
// Runs the host block-scaled GETT reference, writing the result into
// reference_D and the generated output scale factors into reference_SFD.
bool run_reference(cutlass::MatrixCoord problem_size,
                   int32_t batch_count,
                   ElementCompute alpha,
                   ElementCompute beta,
                   float epilogue_st)
{
  int32_t const gemm_m = problem_size.row();
  int32_t const gemm_k = problem_size.column();
  int32_t const gemm_n = 1;
  int32_t const gemm_batch = batch_count;

  using ProblemShapeType = cute::Shape<int, int, int, int>;
  auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};

  // Output scale factors use the kernel's SFD layout generator.
  auto SfD = make_tensor(make_iterator(this->reference_SFD.host_data()),
                         Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL));

  // Packed strides for each operand in its declared layout.
  using StrideA = cutlass::gemm::TagToStrideA_t<LayoutA>;
  using StrideB = cutlass::gemm::TagToStrideB_t<LayoutB>;
  using StrideC = cutlass::gemm::TagToStrideC_t<LayoutC>;
  using StrideD = cutlass::gemm::TagToStrideC_t<LayoutD>;
  StrideA stride_a = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(gemm_m, gemm_k, gemm_batch));
  StrideB stride_b = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(gemm_n, gemm_k, gemm_batch));
  StrideC stride_c = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));
  StrideD stride_d = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(gemm_m, gemm_n, gemm_batch));

  auto A = make_tensor(make_iterator(this->tensor_A.host_data()),
                       cute::make_layout(cute::make_shape(gemm_m, gemm_k, gemm_batch), stride_a));
  auto B = make_tensor(make_iterator(this->tensor_B.host_data()),
                       cute::make_layout(cute::make_shape(gemm_n, gemm_k, gemm_batch), stride_b));

  auto C = cute::make_tensor(make_iterator(this->tensor_C.host_data()),
                             cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_c));
  auto D = cute::make_tensor(make_iterator(this->reference_D.host_data()),
                             cute::make_layout(cute::make_shape(gemm_m, gemm_n, gemm_batch), stride_d));

  auto layout_sfa = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFA(problem_shape_MNKL);
  auto layout_sfb = Sm100BlockScaledInputConfig::tile_atom_to_shape_SFB(problem_shape_MNKL);

  auto SfA = make_tensor(this->tensor_SFA.host_data(), layout_sfa);
  auto SfB = make_tensor(this->tensor_SFB.host_data(), layout_sfb);

  // Internally scale factor of mainloop will be disabled when ElementA/B == ElementSFA/B.
  using MainloopParams = cutlass::reference::host::GettBlockScalingMainloopParams<
      ElementAccumulator, // ElementAccumulator
      decltype(A),        // TensorA
      decltype(SfA),      // TensorSfA
      decltype(B),        // TensorB
      decltype(SfB)       // TensorSfB
      >;
  MainloopParams mainloop_params{A, SfA, B, SfB};

  using EpilogueParams = cutlass::reference::host::GettBlockScalingEpilogueParams<
      ElementCompute,         // ElementScalar
      ElementAccumulator,     // ElementAccumulator
      ElementCompute,         // ElementCompute
      decltype(C),            // TensorC
      decltype(D),            // TensorD
      decltype(SfD),          // TensorSfD
      cute::Int<kVectorSize>, // OutputVectorSize
      cutlass::reference::host::SfStrategy::SfDGen>;
  EpilogueParams epilogue_params{alpha, beta, C, D, SfD, epilogue_st};

  cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params);

  return true;
}
|
||||
|
||||
virtual typename Gemv::Arguments get_arguments(
|
||||
cutlass::MatrixCoord problem_size, int32_t batch_count,
|
||||
float epilogue_st, ElementCompute alpha, ElementCompute beta) = 0;
|
||||
|
||||
// Runs the device GEMV kernel once (verification mode) or kIterations times
// with CUDA-event timing (profiling mode). Returns false on any setup,
// launch, or CUDA API failure.
//
// Fixes vs. previous revision:
//  - epilogue_st was tagged [[maybe_unused]] although it is forwarded to
//    get_arguments(); the attribute is removed.
//  - the two timing events were leaked on every early error return inside the
//    profiling branch; they are now released on all exit paths.
bool run_gemv(cutlass::MatrixCoord problem_size,
              int32_t batch_count,
              ElementCompute alpha,
              ElementCompute beta,
              float epilogue_st,
              bool is_profiling,
              int kIterations)
{
  // Not support batch input for testing
  const int32_t gemm_m = problem_size.row();
  const int32_t gemm_k = problem_size.column();
  [[maybe_unused]] const int32_t gemm_n = 1;
  [[maybe_unused]] const int32_t gemm_batch = batch_count;

  Gemv gemv_op;
  typename Gemv::Arguments arguments = this->get_arguments(
    problem_size, batch_count, epilogue_st, alpha, beta
  );

  cutlass::Status status = gemv_op.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    printf("can_implement() failed\n");
    return false;
  }

  size_t workspace_size = Gemv::get_workspace_size(arguments);
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  status = gemv_op.initialize(arguments, workspace.get());
  if (status != cutlass::Status::kSuccess) {
    printf("initialize() failed\n");
    return false;
  }

  if (not is_profiling) {
    // Single launch; success is checked after the if/else below.
    status = gemv_op();
  }
  // profiling
  else {
    cudaError_t result;
    cudaEvent_t events[2];

    for (cudaEvent_t &evt : events) {
      result = cudaEventCreate(&evt);
      if (result != cudaSuccess) {
        std::cerr << "cudaEventCreate failed with error " << cudaGetErrorString(result) << std::endl;
        return false;
      }
    }

    // Releases both timing events. Invoked on every exit path of the
    // profiling branch so early error returns do not leak the events.
    auto destroy_events = [&events]() -> bool {
      bool ok = true;
      for (cudaEvent_t &evt : events) {
        cudaError_t err = cudaEventDestroy(evt);
        if (err != cudaSuccess) {
          std::cerr << "cudaEventDestroy() failed with error " << cudaGetErrorString(err) << std::endl;
          ok = false;
        }
      }
      return ok;
    };

    // warmup
    status = gemv_op();
    if (status != cutlass::Status::kSuccess) {
      std::cerr << "Device execution failed on warmup." << std::endl;
      destroy_events();
      return false;
    }

    result = cudaEventRecord(events[0]);
    if (result != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
      destroy_events();
      return false;
    }

    for (int iter_i = 0; iter_i < kIterations; ++iter_i) {
      status = gemv_op();
      if (status != cutlass::Status::kSuccess) {
        std::cerr << "Device execution failed." << std::endl;
        destroy_events();
        return false;
      }
    }

    result = cudaEventRecord(events[1]);
    if (result != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed with error " << cudaGetErrorString(result) << std::endl;
      destroy_events();
      return false;
    }

    // Kernel launches are asynchronous; synchronize before reading the timer.
    result = cudaDeviceSynchronize();
    if (result != cudaSuccess) {
      std::cerr << "cudaDeviceSynchronize() failed with error " << cudaGetErrorString(result) << std::endl;
      destroy_events();
      return false;
    }

    float elapsed_ms = 0;
    result = cudaEventElapsedTime(&elapsed_ms, events[0], events[1]);
    if (result != cudaSuccess) {
      std::cerr << "cudaEventElapsedTime() failed with error " << cudaGetErrorString(result) << std::endl;
      destroy_events();
      return false;
    }

    if (!destroy_events()) {
      return false;
    }

    // 2 flops (multiply + add) per MAC; byte traffic covers A, B, D and the
    // three scale-factor tensors (one scale factor per kVectorSize elements).
    int64_t flops = int64_t(gemm_m) * gemm_n * gemm_k * 2;
    int64_t bytes = cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementA>) * int64_t(gemm_m) * int64_t(gemm_k)) +
                    cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementB>) * int64_t(gemm_k) * int64_t(gemm_n)) +
                    cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementD>) * int64_t(gemm_m) * int64_t(gemm_n)) +
                    cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFA>) * int64_t(gemm_m) * int64_t(gemm_k) / int64_t(kVectorSize)) +
                    cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFB>) * int64_t(gemm_k) * int64_t(gemm_n) / int64_t(kVectorSize)) +
                    cutlass::bits_to_bytes<int64_t>(int64_t(cute::sizeof_bits_v<ElementSFD>) * int64_t(gemm_m) * int64_t(gemm_n) / int64_t(kVectorSize));

    double gflops_per_second = double(flops) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1.0e9);
    double gbytes_per_second = double(bytes) * kIterations * gemm_batch / double(elapsed_ms / 1000.0f) / double(1 << 30);
    double elapsed_ms_per_iter = double(elapsed_ms) / kIterations;

    std::cout << "         Problem: "
              << gemm_m << "-by-" << gemm_n << "-by-" << gemm_k
              << ", batch size: " << gemm_batch
              << std::endl;
    std::cout << "         Runtime: " << elapsed_ms_per_iter << " ms" << std::endl;
    std::cout << "          GFLOPs: " << gflops_per_second << " GFLOPs" << std::endl;
    std::cout << "Memory bandwidth: " << gbytes_per_second << " GiB/s" << std::endl;
  }

  if (status != cutlass::Status::kSuccess) {
    printf("gemv exec failed\n");
    return false;
  }

  return true;
}
|
||||
|
||||
// End-to-end correctness check: initialize data, run the device GEMV once,
// compute the host reference, and compare the two outputs element-wise.
// Returns true only when every step succeeds and the results match.
bool run_and_verify(cutlass::MatrixCoord problem_size,
                    int32_t batch_count,
                    ElementCompute alpha,
                    ElementCompute beta,
                    float epilogue_st)
{
  // Populate input/output tensors.
  if (!this->initialize(problem_size, batch_count)) {
    return false;
  }

  // Single non-profiled device launch.
  if (!this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st,
                      false /*is_profiling*/, 1)) {
    return false;
  }

  // Host-side reference computation.
  if (!this->run_reference(problem_size, batch_count, alpha, beta, epilogue_st)) {
    printf("run_reference() failed\n");
    return false;
  }

  // Device output vs. reference output.
  if (!this->compare_reference()) {
    printf("compare_reference() failed\n");
    return false;
  }

  return true;
}
|
||||
|
||||
// Performance measurement entry point: initializes data, then times the
// device GEMV over kIterations launches (timing/reporting happens inside
// run_gemv). Returns false if initialization or profiling fails.
bool profile(cutlass::MatrixCoord problem_size,
             int32_t batch_count,
             ElementCompute alpha,
             ElementCompute beta,
             float epilogue_st,
             int kIterations = 10)
{
  // Populate input/output tensors.
  if (!this->initialize(problem_size, batch_count)) {
    return false;
  }

  // Timed device launches.
  return this->run_gemv(problem_size, batch_count, alpha, beta, epilogue_st,
                        true /*is_profiling*/, kIterations);
}
|
||||
|
||||
public:
  // Data Storage
  cutlass::HostTensor<ElementA, LayoutA> tensor_A;        // operand A
  cutlass::HostTensor<ElementSFA, LayoutA> tensor_SFA;    // scale factors for A

  cutlass::HostTensor<ElementB, LayoutB> tensor_B;        // operand B
  cutlass::HostTensor<ElementSFB, LayoutB> tensor_SFB;    // scale factors for B

  cutlass::HostTensor<ElementC, LayoutC> tensor_C;        // source operand C

  cutlass::HostTensor<ElementD, LayoutD> tensor_D;        // device-computed output D
  cutlass::HostTensor<ElementSFD, LayoutD> tensor_SFD;    // device-computed output scale factors

  cutlass::HostTensor<ElementD, LayoutD> reference_D;     // host-reference output D
  cutlass::HostTensor<ElementSFD, LayoutD> reference_SFD; // host-reference output scale factors

  // Data Init Setting
  cutlass::Distribution::Kind init_A;
  cutlass::Distribution::Kind init_B;
  cutlass::Distribution::Kind init_C;
  cutlass::Distribution::Kind init_D;
  cutlass::Distribution::Kind init_SFA;
  cutlass::Distribution::Kind init_SFB;
  cutlass::Distribution::Kind init_SFD;
  // Seed for pseudo-random tensor initialization.
  uint64_t seed;
|
||||
};
|
||||
|
||||
// Concrete testbed for the FP4 GEMV with SFD generation. Derives all element
// and layout types from the Gemv kernel's epilogue and implements
// get_arguments() for this configuration.
template<typename Gemv_>
struct TestbedGemvFp4SFD : public TestbedGemvFp4SFDBase<
  Gemv_,
  typename Gemv_::ElementC,
  typename Gemv_::EpilogueOutputOp::LayoutOutput,
  typename Gemv_::EpilogueOutputOp::ElementD,
  typename Gemv_::EpilogueOutputOp::LayoutOutput,
  typename Gemv_::EpilogueOutputOp::ElementSFD,
  typename Gemv_::EpilogueOutputOp::LayoutSFD,
  typename Gemv_::EpilogueOutputOp::ElementCompute,
  Gemv_::EpilogueOutputOp::kVectorSize
> {
  using Base = TestbedGemvFp4SFDBase<
    Gemv_,
    typename Gemv_::ElementC,
    typename Gemv_::EpilogueOutputOp::LayoutOutput,
    typename Gemv_::EpilogueOutputOp::ElementD,
    typename Gemv_::EpilogueOutputOp::LayoutOutput,
    typename Gemv_::EpilogueOutputOp::ElementSFD,
    typename Gemv_::EpilogueOutputOp::LayoutSFD,
    typename Gemv_::EpilogueOutputOp::ElementCompute,
    Gemv_::EpilogueOutputOp::kVectorSize
  >;

  using Base::Base;
  using Gemv = Gemv_;
  using ElementCompute = typename Base::ElementCompute;
  using SfAtom_Input = typename Base::SfAtom_Input;
  using Blk_MN_Input = typename Base::Blk_MN_Input;
  using Blk_SF_Input = typename Base::Blk_SF_Input;

  static constexpr int kVectorSize = Base::kVectorSize;

  // Assembles the kernel Arguments: epilogue params (D, SFD, scalars) plus
  // device pointers and per-batch strides for A/B/C/D and the scale factors.
  typename Gemv::Arguments get_arguments(
      cutlass::MatrixCoord problem_size,
      int32_t batch_count, float epilogue_st,
      ElementCompute alpha, ElementCompute beta) override {

    const int32_t gemm_m = problem_size.row();
    const int32_t gemm_k = problem_size.column();
    [[maybe_unused]] const int32_t gemm_n = 1;
    [[maybe_unused]] const int32_t gemm_batch = batch_count;

    // Input scale-factor batch strides follow the scale-factor atom tiling of
    // the (m, k) / (n, k) extents.
    auto blocks_k = cutlass::ceil_div(gemm_k, cute::size<1>(shape(SfAtom_Input{})));
    auto blocks_m = cutlass::ceil_div(gemm_m, Blk_MN_Input{});
    auto blocks_n = cutlass::ceil_div(gemm_n, Blk_MN_Input{});

    int batch_stride_SFA = blocks_m * Blk_MN_Input{} * blocks_k * Blk_SF_Input{};
    int batch_stride_SFB = blocks_n * Blk_MN_Input{} * blocks_k * Blk_SF_Input{};

    // Derive the SFD batch stride from the same layout generator the host
    // reference uses, so device and reference agree on SFD placement.
    using ProblemShapeType = cute::Shape<int, int, int, int>;
    auto problem_shape_MNKL = ProblemShapeType{gemm_m, gemm_n, gemm_k, gemm_batch};

    using Sm1xxBlockScaledOutputConfig = typename Base::Sm1xxBlockScaledOutputConfig;
    auto layout_SFD = Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(problem_shape_MNKL);

    // stride<2> addresses the batch (L) mode of the generated layout; its
    // second component is the element stride between consecutive batches.
    auto sfd_batch_stride_crd = cute::stride<2>(layout_SFD);
    int batch_stride_SFD = static_cast<int>(cute::get<1>(sfd_batch_stride_crd));

    // Assemble kernel arguments (field order fixed by Gemv::Arguments).
    typename Gemv::Arguments arguments{
      problem_size,                        // problem_size
      batch_count,                         // batch_count
      typename Gemv::EpilogueOutputOp::Params{
        this->tensor_D.device_ref(),       // tensor_d
        this->tensor_SFD.device_data(),    // scale_factor_d_ptr
        alpha,                             // alpha
        beta,                              // beta
        epilogue_st,                       // st
        batch_stride_SFD,                  // batch_stride_sfd
        gemm_m                             // stride_d
      },
      this->tensor_A.device_ref(),         // ref_A
      this->tensor_B.device_data(),        // ptr_B
      this->tensor_C.device_data(),        // ptr_C
      this->tensor_D.device_data(),        // ptr_D
      this->tensor_SFA.device_data(),      // ptr_SFA
      this->tensor_SFB.device_data(),      // ptr_SFB
      gemm_k,                              // stride_A
      gemm_m * gemm_k,                     // batch_stride_A
      gemm_k,                              // batch_stride_B
      gemm_m,                              // batch_stride_C
      gemm_m,                              // batch_stride_D
      batch_stride_SFA,                    // batch_stride_SFA
      batch_stride_SFB,                    // batch_stride_SFB
      batch_stride_SFD                     // batch_stride_SFD
    };

    return arguments;
  }
};
|
||||
|
||||
// Command-line options for the FP4 GEMV example.
struct Options {
  bool help = false;        // print usage and exit

  // Problem extents. n is fixed to 1 (GEMV) and is not parsed from the
  // command line.
  int m = 4096;
  int k = 2048;
  int n = 1;
  int batch = 1;

  // Epilogue scalars.
  float alpha = 1.0f;
  float beta = 0.0f;
  float epilogue_st = -1.0f; // sentinel for random

  bool profiling = true;    // run the timed profiling pass after verification
  int iterations = 10;      // profiling iteration count

  // Parses the command line
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
      return;
    }

    cmd.get_cmd_line_argument("m", m);
    cmd.get_cmd_line_argument("k", k);
    cmd.get_cmd_line_argument("batch", batch);
    cmd.get_cmd_line_argument("alpha", alpha);
    cmd.get_cmd_line_argument("beta", beta);
    cmd.get_cmd_line_argument("epilogue_st", epilogue_st);
    cmd.get_cmd_line_argument("profiling", profiling);
    cmd.get_cmd_line_argument("iterations", iterations);
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "91_fp4_gemv\n\n"
        << "  FP4 GEMV with block-scaled inputs and outputs.\n\n"
        << "Options:\n\n"
        << "  --help                      If specified, displays this usage statement\n\n"
        << "  --m=<int>                   Sets the M extent of the GEMM\n"
        << "  --k=<int>                   Sets the K extent of the GEMM\n"
        << "  --batch=<int>               Sets the batch count of the GEMM\n"
        << "  --alpha=<f32>               Epilogue scalar alpha\n"
        << "  --beta=<f32>                Epilogue scalar beta\n"
        << "  --epilogue_st=<f32>         Epilogue ST value\n\n"
        << "  --profiling=<bool>          Whether to run profiling\n\n"
        << "  --iterations=<int>          Number of profiling iterations to perform\n\n";

    out
      << "\n\nExamples:\n\n"
      << "$ " << "91_fp4_gemv" << " --m=4096 --k=2048 --batch=1 \n\n";

    return out;
  }
};
|
||||
|
||||
// Instantiates the FP4 (e2m1) block-scaled GEMV with e4m3 scale factors and
// runs it per `options`: either a verify pass (device vs. host reference) or
// a timed profiling pass. Returns true on success.
bool
run_fp4_gemv_device(Options const& options)
{
  // GEMV: the N extent must be 1.
  CUTLASS_ASSERT(options.n == 1);

  // Operand A: FP4 values with FP8 (e4m3) scale factors, row-major.
  using ElementA = cutlass::float_e2m1_t;
  using ElementSFA = cutlass::float_e4m3_t;
  using LayoutA = cutlass::layout::RowMajor;

  // Operand B (the vector): same FP4/FP8 pairing.
  using ElementB = cutlass::float_e2m1_t;
  using ElementSFB = cutlass::float_e4m3_t;

  using ElementC = cutlass::float_e2m1_t;

  // Output D: FP4 with generated FP8 scale factors.
  using ElementD = cutlass::float_e2m1_t;
  using LayoutD = cutlass::layout::ColumnMajor;

  using ElementSFD = cutlass::float_e4m3_t;
  // Indicate SF is computed along col dim. Does NOT indicate actual layout of SFD
  using LayoutSFD = cutlass::layout::ColumnMajor;

  // Mainloop accumulates in half precision; epilogue computes in float.
  using ElementAccumulatorMainloop = cutlass::half_t;
  using ElementAccumulator = float;
  using ElementCompute = float;

  ElementCompute alpha{options.alpha};
  ElementCompute beta{options.beta};
  // Must be a positive number.
  // NOTE(review): rand() is not seeded here, so the "random" fallback value
  // is the same on every run — confirm whether that determinism is intended.
  const float epilogue_st = options.epilogue_st < 0.f ?
                            static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / 5)) :
                            options.epilogue_st;

  // One scale factor per 16 elements; 128-bit vectorized accesses on A.
  static constexpr int kVectorSize = 16;
  static constexpr int kElementsPerAccess = 128 / cutlass::sizeof_bits<ElementA>::value;

  using ThreadShape = cutlass::gemm::GemmShape<16, 8>;
  static_assert(kVectorSize == ThreadShape::kM, "vector size and thread in row should be equal");

  // Construct Epilogue
  using EpilogueOp = typename cutlass::epilogue::threadblock::GemvEpilogueWithScalingFactor<kVectorSize,
                                                                                           ThreadShape,
                                                                                           ElementCompute,
                                                                                           ElementAccumulator,
                                                                                           ElementC,
                                                                                           ElementD,
                                                                                           ElementSFD,
                                                                                           LayoutD,
                                                                                           LayoutSFD>;

  // Construct Mainloop
  using Gemv = cutlass::gemm::device::GemvBlockScaled<
    cutlass::gemm::kernel::
      GemvBlockScaled<ElementA, LayoutA, ElementB, ElementD, ElementAccumulatorMainloop, EpilogueOp, kElementsPerAccess>>;

  TestbedGemvFp4SFD<Gemv> testbed;

  bool pass = true;

  if (options.profiling) {
    pass = testbed.profile(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st, options.iterations);
  }
  else {
    pass = testbed.run_and_verify(cutlass::MatrixCoord{options.m, options.k}, options.batch, alpha, beta, epilogue_st);
  }

  return pass;
}
|
||||
|
||||
// Entry point: parse options, always run a verification pass first, then
// optionally run the timed profiling pass. Requires SM100 support at compile
// time; otherwise prints a notice and exits.
int
main(int argc, char const** argv)
{
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  // Verification always runs first, with profiling disabled.
  {
    Options verification_options = options;
    verification_options.profiling = false;

    if (!run_fp4_gemv_device(verification_options)) {
      printf("test fail\n");
      return 1;
    }
    printf("test pass\n");
  }

  // Timed profiling pass, if requested.
  if (options.profiling) {
    printf("\nProfiling...\n");
    if (!run_fp4_gemv_device(options)) {
      printf("profiling fail\n");
      return 1;
    }
    printf("profiling completed\n");
  }

  return 0;
#else
  std::cerr << "Unsupported example. Please ensure CUTLASS_ARCH_MMA_SM100_SUPPORTED is defined.\n";
  return 0;
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
}
|
||||
36
examples/91_fp4_gemv/CMakeLists.txt
Normal file
36
examples/91_fp4_gemv/CMakeLists.txt
Normal file
@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
# Build the FP4 GEMV example on non-MSVC toolchains only.
# NOTE(review): the MSVC exclusion reason is not stated here — presumably a
# host-compiler limitation with this example; confirm before enabling.
if (NOT MSVC)

cutlass_example_add_executable(
  91_fp4_gemv
  91_fp4_gemv.cu
)

endif()
|
||||
@ -163,7 +163,12 @@ foreach(EXAMPLE
|
||||
82_blackwell_distributed_gemm
|
||||
83_blackwell_sparse_gemm
|
||||
84_blackwell_narrow_precision_sparse_gemm
|
||||
86_blackwell_mixed_dtype_gemm
|
||||
87_blackwell_geforce_gemm_blockwise
|
||||
88_hopper_fmha
|
||||
89_sm103_fp4_ultra_gemm
|
||||
90_sm103_fp4_ultra_grouped_gemm
|
||||
91_fp4_gemv
|
||||
)
|
||||
|
||||
add_subdirectory(${EXAMPLE})
|
||||
|
||||
@ -1,5 +1,14 @@
|
||||
# CUTLASS - Programming Examples
|
||||
|
||||
> [!IMPORTANT]
|
||||
> ### ⚠️ **Not for Benchmarking!** ⚠️
|
||||
>
|
||||
> These examples are designed **solely for demonstrating CUTLASS functionality** and may **NOT be optimized for performance benchmarking**.
|
||||
>
|
||||
> **For accurate performance measurements**, please use the **[CUTLASS Profiler](../tools/profiler/)** instead (recommended) or manually auto-tune the example, if unavailable via the profiler.
|
||||
>
|
||||
|
||||
|
||||
* [00_basic_gemm](00_basic_gemm/)
|
||||
|
||||
launches a basic GEMM with single precision inputs and outputs
|
||||
|
||||
@ -39,14 +39,13 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include <iostream>
|
||||
#include <cuda/atomic>
|
||||
#include <cuda/std/atomic>
|
||||
#include CUDA_STD_HEADER(atomic)
|
||||
|
||||
#include "cute/layout.hpp"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cutlass/cuda_host_adapter.hpp"
|
||||
|
||||
|
||||
|
||||
@ -452,8 +452,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
|
||||
dim3 dimBlock(128);
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
|
||||
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
|
||||
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
|
||||
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
|
||||
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
|
||||
int smemBytes = sizeof(SMEMStorage);
|
||||
|
||||
auto* kernel_ptr = &gemm_device<SMEMStorage,
|
||||
|
||||
@ -528,8 +528,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
|
||||
dim3 dimBlock(128);
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
|
||||
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
|
||||
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
|
||||
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
|
||||
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
|
||||
int smemBytes = sizeof(SMEMStorage);
|
||||
|
||||
auto* kernel_ptr = &gemm_device<SMEMStorage,
|
||||
|
||||
@ -567,8 +567,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
|
||||
dim3 dimBlock(128);
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
|
||||
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
|
||||
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
|
||||
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
|
||||
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
|
||||
int smemBytes = sizeof(SMEMStorage);
|
||||
|
||||
auto* kernel_ptr = &gemm_device<SMEMStorage,
|
||||
|
||||
@ -575,6 +575,7 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
|
||||
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
|
||||
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
|
||||
|
||||
int smemBytes = sizeof(SMEMStorage);
|
||||
|
||||
auto* kernel_ptr = &gemm_device<SMEMStorage,
|
||||
|
||||
@ -681,8 +681,8 @@ void gemm_host_f16xf16_f32_f32_tnt(TypeA const* device_ptr_A, LayoutA layout_A,
|
||||
|
||||
dim3 dimBlock(128);
|
||||
dim3 dimCluster(size<0>(cluster_shape), size<1>(cluster_shape), size<2>(cluster_shape));
|
||||
dim3 dimGrid(round_up(size(ceil_div(Gemm_M, bM)), dimCluster.x),
|
||||
round_up(size(ceil_div(Gemm_N, bN)), dimCluster.y));
|
||||
dim3 dimGrid(size(ceil_div(Gemm_M, bM * size<1>(cluster_layout_vmnk))) * dimCluster.x,
|
||||
size(ceil_div(Gemm_N, bN * size<2>(cluster_layout_vmnk))) * dimCluster.y);
|
||||
int smemBytes = sizeof(SMEMStorage);
|
||||
|
||||
auto* kernel_ptr = &gemm_device<SMEMStorage,
|
||||
|
||||
@ -69,7 +69,7 @@
|
||||
"import numpy as np\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"import cutlass\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"\n",
|
||||
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
|
||||
"# Set to `False` to omit this information.\n",
|
||||
@ -106,7 +106,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Declaring and running a GEMM\n",
|
||||
"To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n",
|
||||
"To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Gemm` call.\n",
|
||||
"This sets up a default GEMM operation for the given device on which you are running.\n",
|
||||
"\n",
|
||||
"Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n",
|
||||
@ -123,7 +123,7 @@
|
||||
"source": [
|
||||
"# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n",
|
||||
"# specifying `element_accumulator` is not required if it is the same as `element`\n",
|
||||
"plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n",
|
||||
"plan = cutlass_cppgen.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=np.float32)\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -133,7 +133,7 @@
|
||||
"id": "4a5856de",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
|
||||
"There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifying the types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -172,7 +172,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Changing operation modes\n",
|
||||
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
|
||||
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass_cppgen.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
|
||||
"\n",
|
||||
"The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations."
|
||||
]
|
||||
@ -197,7 +197,7 @@
|
||||
"\n",
|
||||
"As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
|
||||
"\n",
|
||||
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
|
||||
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on initialization."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -208,7 +208,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n",
|
||||
"plan.opclass = cutlass.OpcodeClass.Simt\n",
|
||||
"plan.opclass = cutlass_cppgen.OpcodeClass.Simt\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -262,7 +262,7 @@
|
||||
"alpha = np.float16(1.)\n",
|
||||
"beta = np.float16(2.)\n",
|
||||
"\n",
|
||||
"plan.opclass = cutlass.OpcodeClass.TensorOp\n",
|
||||
"plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -336,13 +336,13 @@
|
||||
"# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n",
|
||||
"# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n",
|
||||
"if plan.cc < 90:\n",
|
||||
" plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n",
|
||||
" plan.swizzling_functor = cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK\n",
|
||||
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n",
|
||||
"else:\n",
|
||||
" # Stream-K is currently only supported for warp-specialized cooperative kernels\n",
|
||||
" td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n",
|
||||
" td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK\n",
|
||||
"\n",
|
||||
" plan.compile(td)\n",
|
||||
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
|
||||
@ -391,12 +391,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from cutlass.backend.utils.device import device_cc\n",
|
||||
"from cutlass_cppgen.backend.utils.device import device_cc\n",
|
||||
"\n",
|
||||
"# 3xTF32 requires SM80 or higher\n",
|
||||
"if device_cc() >= 80:\n",
|
||||
" plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n",
|
||||
" plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n",
|
||||
" plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
|
||||
" plan.math_operation = cutlass_cppgen.MathOperation.multiply_add_fast_f32\n",
|
||||
"\n",
|
||||
" # Create input/output tensors in FP32\n",
|
||||
" A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n",
|
||||
@ -433,9 +433,9 @@
|
||||
"\n",
|
||||
"# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n",
|
||||
"if device_cc() >= 90:\n",
|
||||
" plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
|
||||
" layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n",
|
||||
" layout_C=cutlass.LayoutType.ColumnMajor)\n",
|
||||
" plan = cutlass_cppgen.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
|
||||
" layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor,\n",
|
||||
" layout_C=cutlass_cppgen.LayoutType.ColumnMajor)\n",
|
||||
"\n",
|
||||
" # Create input/output tensors in FP8\n",
|
||||
" A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n",
|
||||
|
||||
@ -68,7 +68,7 @@
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import cutlass\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"\n",
|
||||
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
|
||||
"# omit this information.\n",
|
||||
@ -112,7 +112,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)\n",
|
||||
"plan = cutlass_cppgen.op.Gemm(element=np.float16, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
|
||||
@ -75,7 +75,7 @@
|
||||
"\n",
|
||||
"## Declaring a grouped GEMM via the CUTLASS Python interface\n",
|
||||
"A grouped GEMM operation is declared similarly to a GEMM operation in the CUTLASS Python interface: one\n",
|
||||
"simply calls `cutlass.op.GroupedGemm`."
|
||||
"simply calls `cutlass_cppgen.op.GroupedGemm`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -85,11 +85,11 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cutlass\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"dtype = torch.float16\n",
|
||||
"plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)"
|
||||
"plan = cutlass_cppgen.op.GroupedGemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -174,7 +174,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"op = plan.construct()\n",
|
||||
"grouped_gemm = cutlass.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
|
||||
"grouped_gemm = cutlass_cppgen.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -182,7 +182,7 @@
|
||||
"id": "c8ca3991",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The `cutlass.emit.pytorch` function emits:\n",
|
||||
"The `cutlass_cppgen.emit.pytorch` function emits:\n",
|
||||
"* `out/grouped_gemm_kernel.cu`: This file contains the declaration of the CUTLASS kernel and a method to call it from PyTorch tensors\n",
|
||||
"* `out/grouped_gemm.cpp`: This file contains a C++ wrapper around the aforementioned CUTLASS kernel\n",
|
||||
"* `setup.py`: This file contains the `setuptools` script for building and installing the generated extension\n",
|
||||
|
||||
@ -62,7 +62,7 @@
|
||||
"import torch\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"import cutlass\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"\n",
|
||||
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
|
||||
"# Set to `false` to omit this information.\n",
|
||||
@ -80,7 +80,7 @@
|
||||
"dilation = (1, 1)\n",
|
||||
"\n",
|
||||
"# Compute the output size [N, P, Q, K]\n",
|
||||
"N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
|
||||
"N, P, Q, K = cutlass_cppgen.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
|
||||
"\n",
|
||||
"dtype = torch.float16\n",
|
||||
"type_A = torch.float16\n",
|
||||
@ -111,7 +111,7 @@
|
||||
"source": [
|
||||
"## Declaring and running a Conv2d Fprop\n",
|
||||
"\n",
|
||||
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
|
||||
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
|
||||
"\n",
|
||||
"Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n",
|
||||
"\n",
|
||||
@ -125,7 +125,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Specifying `element_accumulator` is not required if it is the same as `element`\n",
|
||||
"plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan = cutlass_cppgen.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -133,7 +133,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n",
|
||||
"There are many other ways to construct a plan from `cutlass_cppgen.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass_cppgen.op.Conv2dFprop` constructor.\n",
|
||||
"\n",
|
||||
"We then compare the output to running the Conv2d using PyTorch. PyTorch use NCHW layout by default, so permutations are required."
|
||||
]
|
||||
@ -200,7 +200,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan_dgrad = cutlass_cppgen.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
|
||||
"\n",
|
||||
"grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n",
|
||||
@ -225,7 +225,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan_wgrad = cutlass_cppgen.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
|
||||
"plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
|
||||
"\n",
|
||||
"grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n",
|
||||
|
||||
@ -67,17 +67,17 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import cutlass\n",
|
||||
"from cutlass.epilogue import relu\n",
|
||||
"from cutlass import Tensor as FakeTensor\n",
|
||||
"from cutlass.utils.profiler import CUDAEventProfiler\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"from cutlass_cppgen.epilogue import relu\n",
|
||||
"from cutlass_cppgen import Tensor as FakeTensor\n",
|
||||
"from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n",
|
||||
"\n",
|
||||
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
|
||||
"# omit this information.\n",
|
||||
"print_module = True\n",
|
||||
"\n",
|
||||
"# The Epilogue Visitor feature currently only works for SM80 and 90\n",
|
||||
"from cutlass.backend.utils.device import device_cc\n",
|
||||
"from cutlass_cppgen.backend.utils.device import device_cc\n",
|
||||
"if device_cc() not in [80, 90]:\n",
|
||||
" import sys\n",
|
||||
" sys.exit()\n",
|
||||
@ -99,7 +99,7 @@
|
||||
"tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
|
||||
"tensor_D = torch.zeros_like(tensor_C)\n",
|
||||
"\n",
|
||||
"plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)"
|
||||
"plan = cutlass_cppgen.op.Gemm(element=torch.float16, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=torch.float32)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -115,7 +115,7 @@
|
||||
"\n",
|
||||
"The example tensors is a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provides the shape and data type information of the inputs and outputs of the epilogue.\n",
|
||||
"\n",
|
||||
"The epilogue can be generated simply through `cutlass.evt.trace(<epilogue function>, <example_tensors>)`."
|
||||
"The epilogue can be generated simply through `cutlass_cppgen.evt.trace(<epilogue function>, <example_tensors>)`."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -139,7 +139,7 @@
|
||||
"bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
|
||||
"tensor_F = torch.zeros_like(tensor_D)\n",
|
||||
"examples_tensors = {\n",
|
||||
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n",
|
||||
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass_cppgen.LayoutType.RowMajor),\n",
|
||||
" \"alpha\": alpha,\n",
|
||||
" \"C\": tensor_C,\n",
|
||||
" \"beta\": beta,\n",
|
||||
@ -150,7 +150,7 @@
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Trace the epilogue visitor\n",
|
||||
"epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)"
|
||||
"epilogue_visitor = cutlass_cppgen.epilogue.trace(example_epilogue, examples_tensors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user