CUTLASS 3.1 (#915)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
This commit is contained in:
ANIKET SHIVAM
2023-04-14 20:19:34 -07:00
committed by GitHub
parent 9b8166e3f0
commit d572cc1aab
482 changed files with 37184 additions and 16419 deletions

View File

@ -243,3 +243,4 @@ if (CUTLASS_NVCC_MAX_ARCH GREATER_EQUAL 75)
endif()
endif()

View File

@ -35,8 +35,6 @@
#include <vector>
#include "../../common/cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/conv/convolution.h"

View File

@ -573,7 +573,7 @@ bool TestSpecificConv2d(
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm>

View File

@ -410,6 +410,7 @@ public:
LayoutC,
ElementCompute,
ElementAccumulator,
ElementC,
cutlass::NumericConverterClamp<ElementC, ElementCompute>
>(
kConvolutionalOperator,
@ -517,7 +518,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm, int InterleavedK>

View File

@ -502,7 +502,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm,

View File

@ -464,7 +464,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// Additionally, each conv2d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template <typename ImplicitGemm>

View File

@ -522,7 +522,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// TestAllConv: Runs cutlass::conv::device::ImplicitGemmConvolution operator and compares it with reference
// TestAllConv runs conv operator on default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
// Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// Additionally, each conv3d test can provide conv problem sizes (conv_test_sizes) and blacklist of sizes
// (conv_blacklist_sizes)
/////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -241,6 +241,106 @@ TEST(SM80_Device_Conv2d_Group_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhw
////////////////////////////////////////////////////////////////////////////////
// Analytic 2 stage SingleGroup kernel
TEST(SM80_Device_Conv2d_Group_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32,
SingleGroupPerCTA_128x128_64x2_64x64x64) {
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
/// Device-level Conv2d instance
using Conv2dGroupFpropKernel = typename cutlass::conv::kernel::DefaultConv2dGroupFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape,
WarpShape,
InstructionShape,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::GroupMode::kSingleGroup,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dGroupFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dGroupFpropKernel>;
/// Run group conv unit test sizes with device-level Conv2d instance
test::conv::device::TestbedGroupConv2dProblemSizes problem_sizes(
ThreadblockShape::kN, ThreadblockShape::kK,
128/cutlass::sizeof_bits<ElementA>::value
);
EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dGroupFprop>(problem_sizes.default_single_group_sizes));
}
////////////////////////////////////////////////////////////////////////////////
// Analytic 2 stage MutipleGroup kernel
TEST(SM80_Device_Conv2d_Group_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32,
MutipleGroupPerCTA_64x64_64x2_32x32x64) {
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = cutlass::half_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>;
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;
/// Device-level Conv2d instance
using Conv2dGroupFpropKernel = typename cutlass::conv::kernel::DefaultConv2dGroupFprop<
ElementA, cutlass::layout::TensorNHWC,
ElementB, cutlass::layout::TensorNHWC,
ElementC, cutlass::layout::TensorNHWC,
ElementAccumulator,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm80,
ThreadblockShape,
WarpShape,
InstructionShape,
cutlass::epilogue::thread::LinearCombination<
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
2,
cutlass::arch::OpMultiplyAdd,
cutlass::conv::GroupMode::kMultipleGroup,
cutlass::conv::IteratorAlgorithm::kAnalytic
>::Kernel;
using Conv2dGroupFprop = cutlass::conv::device::ImplicitGemmConvolution<Conv2dGroupFpropKernel>;
/// Run group conv unit test sizes with device-level Conv2d instance
test::conv::device::TestbedGroupConv2dProblemSizes problem_sizes(
ThreadblockShape::kN, ThreadblockShape::kK,
128/cutlass::sizeof_bits<ElementA>::value
);
EXPECT_TRUE(test::conv::device::TestSpecificConv2d<Conv2dGroupFprop>(problem_sizes.default_multiple_group_sizes));
}
////////////////////////////////////////////////////////////////////////////////
TEST(SM80_Device_Conv2d_Group_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32,
SingleGroupPerCTA_128x128_64x3_64x64x64) {
@ -340,14 +440,14 @@ TEST(SM80_Device_Conv2d_Group_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nh
////////////////////////////////////////////////////////////////////////////////
// Optimized 2 stage singleGroup kernel
// Optimized 2 stage SingleGroup kernel
TEST(SM80_Device_Conv2d_Group_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32,
SingleGroupPerCTA_64x64_64x2_32x32x64) {
/// Conv operation element types for the Gemm equivalent (ImplicitGemm)
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
using ElementC = float;
using ElementC = cutlass::half_t;
using ElementAccumulator = float;
using ElementCompute = float;
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 64>;