CUTLASS 2.6 (#298)

CUTLASS 2.6
This commit is contained in:
Manish Gupta
2021-07-22 21:40:53 -07:00
committed by GitHub
parent 6c29fe20ba
commit e5d51840e8
308 changed files with 32408 additions and 4722 deletions

View File

@ -119,12 +119,12 @@ cudaError_t cutlass_hgemm_nn(
int K,
cutlass::half_t alpha,
cutlass::half_t const *A,
int lda,
cutlass::layout::ColumnMajor::Stride::Index lda,
cutlass::half_t const *B,
int ldb,
cutlass::layout::ColumnMajor::Stride::Index ldb,
cutlass::half_t beta,
cutlass::half_t *C,
int ldc) {
cutlass::layout::ColumnMajor::Stride::Index ldc) {
// Define the GEMM operation
using Gemm = cutlass::gemm::device::Gemm<

View File

@ -67,7 +67,7 @@ beta * C).
Now that we have set up the properties of the data, we have to set up the properties of the computation.
Second, we create template variables of tile sizes for thread-block, warp and mma-op to 128x128x32,
64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate CUTLASS GEMM kernel, it internally
deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in a
bank-conflict-free manner, and a ton of other variables required to compose, initialize, and launch a
high-performance GEMM kernel. This is the beauty of CUTLASS: it relieves the developer from
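As a minimal sketch of how these tile shapes are wired into the device-level GEMM (the element types, layouts, and the Sm70 architecture tag below are illustrative assumptions; the epilogue, threadblock swizzle, and stage count fall back to the library defaults):

using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;  // thread-block tile (MxNxK)
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;    // warp tile (MxNxK)
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;       // mma-op tile (MxNxK)

// The three shapes are passed straight through as template arguments; the thread
// count, shared-memory staging, and swizzling are deduced from them internally.
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B
    cutlass::half_t, cutlass::layout::ColumnMajor,   // C
    cutlass::half_t,                                 // accumulator
    cutlass::arch::OpClassTensorOp,                  // Tensor Core math
    cutlass::arch::Sm70,                             // assumed architecture tag
    ThreadblockShape, WarpShape, InstructionShape>;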

View File

@ -275,10 +275,10 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();

View File

@ -292,10 +292,11 @@ public:
int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;
int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();

View File

@ -48,6 +48,10 @@ addition to its own input activation tile. Therefore the input activation warp t
2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
register file, and the operation can be fully register-file-resident.
When applying the above constraint to convolutions, it is required that the 2nd Convolution
kernel has no halos, so that the data used by each threadblock does not depend on any other
threadblock. Typically this requires that the 2nd Convolution use a 1x1 filter without any padding.
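To make this concrete, here is a hedged sketch of a second-convolution problem size that satisfies the constraint (1x1 filter, zero padding, unit stride), using the same Conv2dProblemSize constructor the example files use; the extents are illustrative only:

cutlass::conv::Conv2dProblemSize b2b_problem_size_1 (
  {128, 56, 56, 64},    // input size  (NHWC): output of the 1st convolution
  {128, 1, 1, 64},      // filter size (KRSC): 1x1 filter, so no halo
  {0, 0, 0, 0},         // padding: must be zero
  {1, 1},               // stride
  {1, 1},               // dilation
  {128, 56, 56, 128}    // output size (NPQK)
);

With these parameters every output pixel of the 2nd convolution reads exactly one input pixel, so each threadblock consumes only the accumulator tile it already holds in registers.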
# Copyright
Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_f16_sm75() {
bool run_nonfused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_f16_sm75() {
bool run_fused_conv2d_fprop_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_f16_sm75() {
bool run_fused_conv2d_fprop_optimized_f16_sm75() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,10 +364,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_f16_sm80() {
bool run_nonfused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_f16_sm80() {
bool run_fused_conv2d_fprop_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm80() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -205,9 +206,10 @@ void run_fused_conv2d_fprop_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -240,7 +242,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
ElementC,
128 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -285,9 +288,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_f16_sm80() {
bool run_fused_conv2d_fprop_optimized_f16_sm80() {
using ElementA = cutlass::half_t;
using ElementB = cutlass::half_t;
@ -311,7 +315,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -355,9 +360,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_s8_sm75() {
bool run_nonfused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_s8_sm75() {
bool run_fused_conv2d_fprop_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_s8_sm75() {
bool run_fused_conv2d_fprop_optimized_s8_sm75() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
ElementC,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,9 +364,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -36,8 +36,6 @@
#include "device/b2b_implicit_gemm_convolution.h"
#include "b2b_interleaved_conv2d_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (
@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
{128, 56, 56, 64} // output size (NPQK)
);
void run_nonfused_conv2d_fprop_s8_sm80() {
bool run_nonfused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_s8_sm80() {
bool run_fused_conv2d_fprop_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm80() {
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
ElementC,
64 / cutlass::sizeof_bits<ElementC>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3,
@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
void run_fused_conv2d_fprop_optimized_s8_sm80() {
bool run_fused_conv2d_fprop_optimized_s8_sm80() {
using ElementA = int8_t;
using ElementB = int8_t;
@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
ElementC,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -359,10 +364,9 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
else
std::cout << "Fail\n";
return pass;
}
////////////////////////////////////////////////////////////////////////////////
#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16() {
bool run_nonfused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_f16() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -116,9 +115,11 @@ void run_nonfused_gemm_f16() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_f16() {
bool run_fused_gemm_f16() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -140,7 +141,8 @@ void run_fused_gemm_f16() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -183,7 +185,6 @@ void run_fused_gemm_f16() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_f16_sm80() {
bool run_nonfused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_f16_sm80() {
ElementOutput,
128 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
3
@ -116,9 +115,11 @@ void run_nonfused_gemm_f16_sm80() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_f16_sm80() {
bool run_fused_gemm_f16_sm80() {
using ElementOutput = cutlass::half_t;
using ElementAccumulator = cutlass::half_t;
@ -140,7 +141,8 @@ void run_fused_gemm_f16_sm80() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -183,7 +185,7 @@ void run_fused_gemm_f16_sm80() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8() {
bool run_nonfused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_s8() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
2
@ -116,9 +115,11 @@ void run_nonfused_gemm_s8() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_s8() {
bool run_fused_gemm_s8() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -140,7 +141,8 @@ void run_fused_gemm_s8() {
ElementOutput,
InstructionShape::kM * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -151,8 +153,6 @@ void run_fused_gemm_s8() {
ElementCompute
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
@ -183,7 +183,7 @@ void run_fused_gemm_s8() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)

View File

@ -39,14 +39,12 @@
#include "device/b2b_gemm.h"
#include "b2b_interleaved_gemm_run.h"
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
////////////////////////////////////////////////////////////////////////////////
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);
void run_nonfused_gemm_s8_sm80() {
bool run_nonfused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -80,7 +78,8 @@ void run_nonfused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
@ -106,7 +105,8 @@ void run_nonfused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
3,
@ -124,9 +124,11 @@ void run_nonfused_gemm_s8_sm80() {
std::cout << "Pass\n";
else
std::cout << "Fail\n";
return pass;
}
void run_fused_gemm_s8_sm80() {
bool run_fused_gemm_s8_sm80() {
using ElementOutput = int8_t;
using ElementAccumulator = int32_t;
@ -148,7 +150,8 @@ void run_fused_gemm_s8_sm80() {
ElementOutput,
8 * InstructionShape::kN / 32,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using EpilogueOutputOp1 =
@ -156,11 +159,10 @@ void run_fused_gemm_s8_sm80() {
ElementOutput,
64 / cutlass::sizeof_bits<ElementOutput>::value,
ElementAccumulator,
ElementCompute
ElementCompute,
cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
>;
using B2bGemm = cutlass::gemm::device::B2bGemm<
int8_t,
cutlass::layout::ColumnMajorInterleaved<32>,
@ -183,8 +185,7 @@ void run_fused_gemm_s8_sm80() {
16,
16,
false,
cutlass::arch::OpMultiplyAddSaturate,
true
cutlass::arch::OpMultiplyAddSaturate
>;
B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;
@ -196,7 +197,6 @@ void run_fused_gemm_s8_sm80() {
else
std::cout << "Fail\n";
return passed;
}
////////////////////////////////////////////////////////////////////////////////
#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)

View File

@ -115,9 +115,7 @@ template <
/// Operation performed by GEMM
typename Operator_ = typename DefaultGemmConfiguration<
OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
ElementAccumulator_>::Operator,
/// Whether Beta is zero or not
bool IsBetaZero = false>
ElementAccumulator_>::Operator>
class B2bGemm {
public:
@ -148,7 +146,6 @@ class B2bGemm {
static int const kAlignmentB = AlignmentB;
static int const kAlignmentC = EpilogueOutputOp1::kCount;
static bool const kSplitKSerial = SplitKSerial;
static bool const kIsBetaZero = IsBetaZero;
static ComplexTransform const kTransformA = ComplexTransform::kNone;
static ComplexTransform const kTransformB = ComplexTransform::kNone;
@ -175,8 +172,7 @@ class B2bGemm {
ThreadblockSwizzle,
kStages,
kSplitKSerial,
Operator,
kIsBetaZero
Operator
>::B2bGemmKernel;
/// Argument structure
@ -422,7 +418,7 @@ public:
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);

View File

@ -255,7 +255,7 @@ public:
void *workspace = nullptr,
cudaStream_t stream = nullptr) {
Status status = initialize(args, workspace);
Status status = initialize(args, workspace, stream);
if (status == Status::kSuccess) {
status = run(stream);

View File

@ -28,53 +28,14 @@
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
#include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"
int run() {
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
<< std::endl;
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm80();
run_fused_conv2d_fprop_optimized_f16_sm80();
run_nonfused_conv2d_fprop_optimized_s8_sm80();
run_fused_conv2d_fprop_optimized_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_conv2d_fprop_optimized_f16_sm75();
run_fused_conv2d_fprop_optimized_f16_sm75();
run_nonfused_conv2d_fprop_optimized_s8_sm75();
run_fused_conv2d_fprop_optimized_s8_sm75();
#endif
return 0;
}
int main() {
int run_sm75() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
@ -85,10 +46,7 @@ int main() {
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
if (!(props.major == 7 && props.minor >= 5)) {
notSupported = true;
}
@ -96,7 +54,83 @@ int main() {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
return run();
bool pass = true;
std::cout << "Running on SM75" << std::endl;
pass &= run_nonfused_conv2d_fprop_optimized_f16_sm75();
pass &= run_fused_conv2d_fprop_optimized_f16_sm75();
pass &= run_nonfused_conv2d_fprop_optimized_s8_sm75();
pass &= run_fused_conv2d_fprop_optimized_s8_sm75();
if(pass)
return 1;
else
return -1;
}
int run_sm80() {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major == 8 && props.minor >= 0)) {
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
bool pass = true;
std::cout << "Running on SM80" << std::endl;
pass &= run_nonfused_conv2d_fprop_optimized_f16_sm80();
pass &= run_fused_conv2d_fprop_optimized_f16_sm80();
pass &= run_nonfused_conv2d_fprop_optimized_s8_sm80();
pass &= run_fused_conv2d_fprop_optimized_s8_sm80();
if(pass)
return 1;
else
return -1;
}
int main() {
int result = 0;
result = run_sm80();
if(!result) { // not supported
result = run_sm75();
if(!result) {
std::cout << "This example isn't supported on current architecture" << std::endl;
}
}
if(result >= 0)
return 0;
else
return -1;
}

View File

@ -28,36 +28,15 @@
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"
int run() {
#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
std::cout << "Running on SM80" << std::endl;
run_nonfused_gemm_f16_sm80();
run_fused_gemm_f16_sm80();
run_nonfused_gemm_s8_sm80();
run_fused_gemm_s8_sm80();
#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
std::cout << "Running on SM75" << std::endl;
run_nonfused_gemm_f16();
run_fused_gemm_f16();
run_nonfused_gemm_s8();
run_fused_gemm_s8();
#endif
return 0;
}
int main() {
int run_sm75() {
bool notSupported = false;
// Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
//
// CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
@ -68,10 +47,7 @@ int main() {
return -1;
}
if (!(props.major * 10 + props.minor >= 75)) {
std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
<< std::endl;
if (!(props.major == 7 && props.minor >= 5)) {
notSupported = true;
}
@ -80,6 +56,86 @@ int main() {
return 0;
}
return run();
bool pass = true;
std::cout << "Running on SM75" << std::endl;
pass &= run_nonfused_gemm_f16();
pass &= run_fused_gemm_f16();
pass &= run_nonfused_gemm_s8();
pass &= run_fused_gemm_s8();
if(pass)
return 1;
else
return -1;
}
int run_sm80() {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!(props.major == 8 && props.minor >= 0)) {
notSupported = true;
}
if (notSupported) {
// Returning zero so this test passes on older Toolkits. Its actions are no-op.
return 0;
}
bool pass = true;
std::cout << "Running on SM80" << std::endl;
pass &= run_nonfused_gemm_f16_sm80();
pass &= run_fused_gemm_f16_sm80();
pass &= run_nonfused_gemm_s8_sm80();
pass &= run_fused_gemm_s8_sm80();
if(pass)
return 1;
else
return -1;
}
int main() {
int result = 0;
result = run_sm80();
if(!result) { // not supported
result = run_sm75();
if(!result) {
std::cout << "This example isn't supported on current architecture" << std::endl;
}
}
if(result >= 0)
return 0;
else
return -1;
}

View File

@ -66,6 +66,7 @@ struct B2bGemm {
cutlass::gemm::GemmCoord problem_size_0;
cutlass::gemm::GemmCoord problem_size_1;
cutlass::gemm::GemmCoord grid_tiled_shape;
int swizzle_log_tile;
typename B2bMma::IteratorA0::Params params_A0;
typename B2bMma::IteratorA0::TensorRef ref_A0;
typename B2bMma::IteratorB0::Params params_B0;
@ -91,7 +92,7 @@ struct B2bGemm {
//
CUTLASS_HOST_DEVICE
Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
Params(): swizzle_log_tile(0), semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
gemm_k_iterations_1(0), gemm_k_size_1(0) { }
CUTLASS_HOST_DEVICE
@ -112,6 +113,7 @@ struct B2bGemm {
problem_size_0(problem_size_0),
problem_size_1(problem_size_1),
grid_tiled_shape(grid_tiled_shape),
swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
params_A0(ref_A0.layout()),
ref_A0(ref_A0),
params_B0(ref_B0.layout()),
@ -211,7 +213,7 @@ struct B2bGemm {
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
@ -315,7 +317,7 @@ struct B2bGemm {
//
threadblock_tile_offset =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
//assume identity swizzle
MatrixCoord threadblock_offset(

View File

@ -209,6 +209,7 @@ struct B2bImplicitGemmConvolution {
cutlass::gemm::GemmCoord grid_tiled_shape;
gemm::GemmCoord implicit_gemm_problem_size_0;
gemm::GemmCoord implicit_gemm_problem_size_1;
int swizzle_log_tile;
int gemm_k_iterations_0;
int gemm_k_iterations_1;
typename B2bMma::IteratorA0::Params iterator_A0;
@ -233,7 +234,7 @@ struct B2bImplicitGemmConvolution {
//
CUTLASS_HOST_DEVICE
Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
Params(): swizzle_log_tile(0), gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
///
CUTLASS_HOST_DEVICE
@ -245,7 +246,6 @@ struct B2bImplicitGemmConvolution {
problem_size_1(args.problem_size_1),
implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
grid_tiled_shape(grid_tiled_shape),
iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
ptr_A0(args.ref_A0.data()),
iterator_B0(args.problem_size_0, args.ref_B0.layout()),
@ -272,6 +272,8 @@ struct B2bImplicitGemmConvolution {
implicit_gemm_problem_size_0,
{ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
args.problem_size_0.split_k_slices);
swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
}
};
@ -296,7 +298,7 @@ struct B2bImplicitGemmConvolution {
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
@ -379,7 +381,7 @@ struct B2bImplicitGemmConvolution {
// Compute logical position within grid
threadblock_tile_idx =
threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// If performing a reduction via split-K, fetch the initial synchronization
if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {

View File

@ -111,9 +111,7 @@ template <
/// If true, kernel is configured to support serial reduction in the epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Beta is zero or not
bool IsBetaZero = false
typename Operator
>
struct DefaultB2bGemm;
@ -321,9 +319,7 @@ template <
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Is Beta zero or not
bool IsBetaZero>
typename Operator>
struct DefaultB2bGemm<
ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
@ -332,7 +328,7 @@ struct DefaultB2bGemm<
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, Stages,
SplitKSerial, Operator, IsBetaZero> {
SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
@ -353,8 +349,7 @@ struct DefaultB2bGemm<
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK,
IsBetaZero>::Epilogue;
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
@ -397,9 +392,7 @@ template <
/// epilogue
bool SplitKSerial,
/// Operation performed by GEMM
typename Operator,
/// Is Beta zero or not
bool IsBetaZero>
typename Operator>
struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
kAlignmentA, ElementB,
layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,
@ -407,7 +400,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
int32_t, arch::OpClassTensorOp, arch::Sm75,
ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
ThreadblockSwizzle, 2, SplitKSerial, Operator> {
using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;
@ -426,8 +419,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
using Epilogue = typename cutlass::epilogue::threadblock::
DefaultInterleavedEpilogueTensorOp<
ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
64 / sizeof_bits<ElementC>::value, InterleavedK,
IsBetaZero>::Epilogue;
64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;
/// Define the kernel-level GEMM operator.
using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;

View File

@ -43,14 +43,122 @@ fp32 data by using NVIDIA Ampere architecture.
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
//
// Methods
//
Result(
double runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess
):
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::gemm::GemmCoord problem_size;
int batch_count;
float alpha;
float beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({5120, 4096, 4096}),
batch_count(1),
reference_check(true),
iterations(20),
alpha(1),
beta() { }
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("alpha", alpha);
cmd.get_cmd_line_argument("beta", beta);
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "14_ampere_tf32_tensorop_gemm example\n\n"
<< " This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --alpha <f32> Epilogue scalar alpha\n"
<< " --beta <f32> Epilogue scalar beta\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --beta=0.707 \n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
int64_t fmas = problem_size.product() * batch_count;
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using ElementAccumulator = float; // <- data type of accumulator
@ -111,14 +219,10 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
SwizzleThreadBlock,
NumStages>;
int run() {
const int length_m = 5120;
const int length_n = 4096;
const int length_k = 4096;
int run(Options &options) {
// Create a tuple of problem size for matrix multiplication
cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
cutlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
@ -166,8 +270,8 @@ int run() {
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);
// Split K dimension into 1 partitions
int split_k_slices = 1;
@ -199,9 +303,74 @@ int run() {
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
// Result structure
Result result;
//
// Construct events
//
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
// Create instantiation for device reference gemm kernel
cutlass::reference::device::Gemm<ElementInputA,
@ -235,12 +404,17 @@ int run() {
tensor_d.host_view(),
tensor_ref_d.host_view());
if (passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << " GFLOPs: " << result.gflops << std::endl;
}
std::cout << (passed ? "Passed" : "Failed") << std::endl;
return (passed ? 0 : -1);
}
int main() {
int main(int argc, const char **argv) {
bool notSupported = false;
@ -272,5 +446,21 @@ int main() {
return 0;
}
return run();
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d TF32 tensor op Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}

View File

@ -152,7 +152,7 @@ int run() {
cutlass::HostTensor<ElementInputE, LayoutInputE> tensor_e(
cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));
// Same size as the above. The above one needs to be reordered and stored in this one.
cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));
// Fill input and output matrices on host using CUTLASS helper functions

View File

@ -158,7 +158,7 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw
constexpr int NumStages = 3;
// This code section describe iterator algorithm selected is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic;
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
@ -189,7 +189,6 @@ using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
@ -755,6 +754,3 @@ int main(int argc, char const **args) {
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,28 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
18_ampere_fp64_tensorop_affine2_gemm
ampere_fp64_tensorop_affine2_gemm.cu
)

View File

@ -0,0 +1,336 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/**
In a normal GEMM, the fast-changing dimension of a matrix always has a stride
equal to 1, e.g. a ColumnMajor or RowMajor matrix. An Affine2 matrix can have
a stride larger than 1 in both dimensions. To support such a layout, we need to
change the method used to visit global memory:
1. We can only visit 1 element at a time because elements are no longer stored
consecutively. Vectorized load/store is not possible.
2. One extra multiplication is needed when calculating the global memory
address
addr = base_pointer + coord1 * stride1 + coord2 * stride2
The rest of the GEMM, which includes the shared memory load/store and mma computation,
is the same.
This example uses an Ampere fp64 tensor core Affine2 GEMM. SIMT kernels
(e.g. sgemm, dgemm) also support the Affine2 layout.
*/
#include <iostream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/kernel/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using ElementAccumulator = double; // Data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation
using ElementInputA = double; // Data type of elements in input tensor
using ElementInputB = double; // Data type of elements in input tensor
using ElementOutput = double; // Data type of elements in output tensor
// Since Affine2 explicitly lists the strides of both dimensions, it does not really matter whether
// it is column major or row major. However, it helps CUTLASS improve load locality if
// CUTLASS knows which dimension of the A/B operand has the smaller stride, i.e. is more dense.
//
// Affine2 ColumnMajor means the row stride is smaller, and Affine2 RowMajor means the column
// stride is smaller.
//
// The Affine2 epilogue reuses the AffineN epilogue, so it does not need to specify column major
// or row major.
using LayoutInputA = cutlass::layout::AffineRank2ColumnMajor;
using LayoutInputB = cutlass::layout::AffineRank2RowMajor;
using LayoutOutput = cutlass::layout::AffineRankN<2>;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 16>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 32, 16>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; // TensorCore instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>;
// Number of pipelines you want to use
constexpr int NumStages = 3;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // Data type of output matrix.
1, // The number of elements per memory
// access. It has to be 1 for
// affine2.
ElementComputeEpilogue>;
using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
ElementInputA, LayoutInputA, cutlass::ComplexTransform::kNone, 1, // AlignmentA has to be 1
ElementInputB, LayoutInputB, cutlass::ComplexTransform::kNone, 1, // AlignmentB has to be 1
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAdd
>::GemmKernel;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
/////////////////////////////////////////////////////////////////////////////////////////////////
int run() {
// Construct Gemm ProblemSize with user defined output size
cutlass::gemm::GemmCoord problem_size = {1024, 512, 1024};
// The stride factor gives the distance between two elements in the different dimensions. The
// first value is the logical distance between two rows, the second between two columns.
// CUTLASS provides a utility, cutlass::layout::Affine2Layout_Factory<Layout>::layout_factory,
// to help convert the stride_factor into the two strides.
//
// It is also totally fine to compute the strides directly without using the utility to
// construct the affine2 layout.
typename LayoutInputA::Stride::Index stride_factor_A[] = {3, 4};
typename LayoutInputB::Stride::Index stride_factor_B[] = {5, 6};
typename LayoutOutput::Stride::Index stride_factor_C[] = {7, 8};
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk(),
cutlass::layout::Affine2Layout_Factory<LayoutInputA>::layout_factory(problem_size.mk(),
stride_factor_A));
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn(),
cutlass::layout::Affine2Layout_Factory<LayoutInputB>::layout_factory(problem_size.kn(),
stride_factor_B));
// Create matrix C used to load for bias addition.
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Create matrix D used to store output from CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Create matrix D with dimensions M x N used to store output from reference
// kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(problem_size.mn(),
cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
stride_factor_C));
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
ElementInputA(4),
ElementInputA(-4),
0); // <- Fill matrix A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
ElementInputB(4),
ElementInputB(-4),
0); // <- Fill matrix B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
ElementOutput(4),
ElementOutput(-4),
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(1);
cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;
int batch_count = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated CUTLASS kernel
typename Gemm::Arguments arguments{
mode,
problem_size,
batch_count,
{alpha, beta},
tensor_a.device_ref().data(), // <- reference to matrix A on device
tensor_b.device_ref().data(), // <- reference to matrix B on device
tensor_c.device_ref().data(), // <- reference to matrix C on device
tensor_d.device_ref().data(), // <- reference to matrix D on device
tensor_a.layout().capacity(problem_size.mk()),
tensor_b.layout().capacity(problem_size.kn()),
tensor_c.layout().capacity(problem_size.mn()),
tensor_d.layout().capacity(problem_size.mn()),
tensor_a.layout().stride(),
tensor_b.layout().stride(),
tensor_c.layout().stride(),
tensor_d.layout().stride()
};
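// Descriptive note: in this universal GEMM argument structure, the four capacity() values act as
// the per-batch strides of A/B/C/D (only meaningful when batch_count > 1), and the last four
// entries are the affine layout strides of each operand.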
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
//
// Create instantiation for device reference gemm kernel
//
// Launch device reference GEMM to compute alpha * (A * B) + beta * C
cutlass::reference::device::Gemm<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator> gemm_device;
gemm_device
(
problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref()
);
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
bool pass = cutlass::reference::host::TensorEquals(tensor_d.host_view(),
tensor_ref_d.host_view());
// Check if output from CUTLASS kernel and reference kernel are equal or not
std::cout << (pass
? "Passed"
: "Failed")
<< std::endl;
CUTLASS_CHECK(status);
return 0;
}
int main(int argc, char const **args) {
bool notSupported = false;
// Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
//
// CUTLASS must be compiled with CUDA 11 Toolkit to run this example.
if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
<< std::endl;
notSupported = true;
}
if (notSupported) {
return 0;
}
return run();
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
19_tensorop_canonical
tensorop_canonical.cu
)

View File

@ -0,0 +1,432 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
This example requires NVIDIA Ampere GPU or later.
*/
// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/default_mma_tensor_op.h"
#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"
// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gemm_complex.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define the overall warp-level problem shape
int const kM = 27;
int const kN = 31;
int const kK = 17;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. This
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.
namespace cutlass {
namespace gemm {
namespace warp {
template <
typename Shape,
typename InstructionShape,
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementScalar
>
class GemmTensorOp {
public:
using WarpShape = GemmShape<
((Shape::kM + InstructionShape::kM - 1) / InstructionShape::kM) * InstructionShape::kM,
((Shape::kN + InstructionShape::kN - 1) / InstructionShape::kN) * InstructionShape::kN,
InstructionShape::kK
>;
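// For the instantiation used in this example (overall shape 27x31x17, instruction shape 8x8x4),
// this rounds the warp tile up to 32x32x4.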
using MmaWarp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
WarpShape,
InstructionShape,
double, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
double, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
double, // Data type of C elements
cutlass::layout::RowMajor // Layout of C matrix
>::Type;
// Number of 'K groups'
int const kKgroups = (Shape::kK + InstructionShape::kK - 1) / InstructionShape::kK;
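// For this example's instantiation: (17 + 4 - 1) / 4 = 5 K groups.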
// Define a 'FragmentIterator' to iterate over slices of accumulators
using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
typename MmaWarp::Shape,
InstructionShape,
double,
typename MmaWarp::Policy::Operator::FragmentC,
cutlass::layout::RowMajor
>;
// Define an epilogue 'Tile Iterator' to iterate over slices of elements in Shared Memory
using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpCanonical<
typename MmaWarp::Shape,
InstructionShape,
double,
cutlass::layout::RowMajor
>;
using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
using TensorRefC = typename AccumulatorTileIterator::TensorRef;
public:
CUTLASS_HOST_DEVICE
GemmTensorOp() { }
CUTLASS_DEVICE
void operator()(
ElementScalar alpha,
TensorRefA ref_A,
TensorRefB ref_B,
ElementScalar beta,
TensorRefC ref_C,
TensorRefC ref_D,
int lane_id) const {
// Instantiate iterators pointing to slices of the A and B matrices in shared memory
typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);
// Instantiate and clear accumulator tile holding the C matrix
typename MmaWarp::FragmentC accum;
accum.clear();
// Instantiate the warp-level matrix multiply operator
MmaWarp mma_op;
// Instantiate fragments holding the slice of the matrix held by each warp
typename MmaWarp::FragmentA frag_A[2];
typename MmaWarp::FragmentB frag_B[2];
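// Two fragments per operand are kept so the load of the next K group can overlap the matrix
// multiply on the current one in the mainloop below (simple double buffering).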
// Load fragments from shared memory
iter_A.load(frag_A[0]);
iter_B.load(frag_B[0]);
++iter_A;
++iter_B;
// Mainloop: iterate over the K groups
CUTLASS_PRAGMA_UNROLL
for (int k = 0; k < kKgroups; ++k) {
// Load fragments from shared memory
iter_A.load(frag_A[(k + 1) % 2]);
iter_B.load(frag_B[(k + 1) % 2]);
++iter_A;
++iter_B;
// Compute the matrix multiply
mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
}
// Instantiate iterators
FragmentIterator accum_frag_it(accum);
AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);
// Define function objects for linear scaling operation
cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;
// Iterate over the epilogue components
CUTLASS_PRAGMA_UNROLL
for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {
// Define storage for slices of the accumulators
typename FragmentIterator::Fragment accum_fragment;
typename FragmentIterator::Fragment source_fragment;
// Select a slice of accumulators from the accumulator tile
accum_frag_it.load(accum_fragment);
++accum_frag_it;
// Load a corresponding slice from Shared memory
source_tile_it.load(source_fragment);
++source_tile_it;
// Compute linear scaling - alpha * AB + beta * C
source_fragment = mul_source(beta, source_fragment);
accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);
// Store the result to shared memory
dest_tile_it.store(accum_fragment);
++dest_tile_it;
}
}
};
} // namespace warp
} // namespace gemm
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in Shared Memory.
__global__ void kernel(
double *D_gmem,
double alpha,
double const *A_gmem,
double const *B_gmem,
double beta,
double const *C_gmem) {
// Define several matrices in shared memory
__shared__ double A[kM][kK];
__shared__ double B[kN][kK];
__shared__ double C[kM][kN];
// Copy data into SMEM
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
for (int k = 0; k < kK; ++k) {
A[m][k] = A_gmem[m * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
for (int k = 0; k < kK; ++k) {
B[n][k] = B_gmem[n * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
C[m][n] = C_gmem[m * kN + n];
}
}
}
__syncthreads();
//
// Instantiate a warp-level matrix multiply operator given the fundamental instruction shape (8x8x4),
// overall shape, data type of each operand, and layout of each operand.
//
using GemmTensorOp = cutlass::gemm::warp::GemmTensorOp<
cutlass::gemm::GemmShape<kM, kN, kK>,
cutlass::gemm::GemmShape<8, 8, 4>,
double, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
double, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
double, // Data type of C elements
cutlass::layout::RowMajor, // Layout of C matrix
double // Scalar type of alpha and beta
>;
// Instantiate the GEMM operator
GemmTensorOp gemm;
// Execute the warp-level GEMM operation
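// Each {pointer, stride} pair below forms a TensorRef: A is row-major with leading dimension kK,
// B is column-major (stored here as B[n][k]) with leading dimension kK, and C/D are row-major
// with leading dimension kN.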
gemm(
alpha,
{&A[0][0], kK},
{&B[0][0], kK},
beta,
{&C[0][0], kN},
{&C[0][0], kN},
threadIdx.x);
__syncthreads();
// Copy results from SMEM back to global memory
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
D_gmem[m * kN + n] = C[m][n];
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Entry point to canonical warp-level GEMM operation
int main(int argc, const char *arg[]) {
bool notSupported = false;
// CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
if (!(__CUDACC_VER_MAJOR__ >= 11)) {
std::cerr << "NVIDIA Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
notSupported = true;
}
cudaDeviceProp props;
cudaError_t error = cudaGetDeviceProperties(&props, 0);
if (error != cudaSuccess) {
std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
return -1;
}
if (!((props.major * 10 + props.minor) >= 80)) {
std::cerr << "This example requires compute capability at least 80."
<< std::endl;
notSupported = true;
}
if (notSupported) {
// Return 0 so tests are considered passing if run on unsupported platforms.
return 0;
}
cutlass::HostTensor<double, cutlass::layout::RowMajor> A({kM, kK});
cutlass::HostTensor<double, cutlass::layout::ColumnMajor> B({kK, kN});
cutlass::HostTensor<double, cutlass::layout::RowMajor> C({kM, kN});
cutlass::HostTensor<double, cutlass::layout::RowMajor> D({kM, kN});
uint64_t seed = 2020;
double max = 8;
double min = -8;
cutlass::reference::host::TensorFillRandomUniform(
A.host_view(),
seed,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
B.host_view(),
seed + 17,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
C.host_view(),
seed + 31,
max,
min,
0
);
A.sync_device();
B.sync_device();
C.sync_device();
D.sync_device();
dim3 grid(1,1);
dim3 block(32, 1, 1);
double alpha = 2.25;
double beta = 1.24;
kernel<<< grid, block >>>(
D.device_data(),
alpha,
A.device_data(),
B.device_data(),
beta,
C.device_data()
);
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
return -1;
}
D.sync_host();
// Compute reference on host
cutlass::HostTensor<double, cutlass::layout::RowMajor> D_ref({kM, kN}, false);
cutlass::reference::host::GemmComplex(
{kM, kN, kK},
alpha,
A.host_ref(),
cutlass::ComplexTransform::kNone,
B.host_ref(),
cutlass::ComplexTransform::kNone,
beta,
C.host_ref(),
D_ref.host_ref(),
double()
);
// Verify reference matches computed
if (!cutlass::reference::host::TensorEquals(
D.host_view(),
D_ref.host_view())) {
std::cerr
<< "A =\n" << A.host_view()
<< "\n\nB = \n" << B.host_view()
<< "\n\nC = " << C.host_view()
<< "\n\nRef =\n" << D_ref.host_view()
<< "\n\nD =\n" << D.host_view() << "\n\n";
std::cerr << "Error - device results mismatch host reference." << std::endl;
return -1;
}
std::cout << "Passed" << std::endl;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
20_simt_canonical
simt_canonical.cu
)

View File

@ -0,0 +1,419 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*
This example requires NVIDIA Maxwell GPU or beyond.
*/
// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>
// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/core_io.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/mma_simt.h"
#include "cutlass/epilogue/warp/fragment_iterator_simt.h"
#include "cutlass/epilogue/warp/tile_iterator_simt.h"
// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/gemm_complex.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define the overall warp-level problem shape
int const kM = 14;
int const kN = 27;
int const kK = 17;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. This
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.
namespace cutlass {
namespace gemm {
namespace warp {
template <
typename Shape,
typename ElementA,
typename LayoutA,
typename ElementB,
typename LayoutB,
typename ElementC,
typename LayoutC,
typename ElementScalar
>
class GemmSimt {
public:
using Policy = cutlass::gemm::warp::MmaSimtPolicy<
cutlass::MatrixShape<4, 8>,
cutlass::layout::RowMajorInterleaved<2>,
cutlass::gemm::GemmShape<4, 4, 1>
>;
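// Descriptive note: MatrixShape<4, 8> arranges the warp's 32 lanes as a 4x8 grid with a
// row-major, interleaved-by-2 lane ordering, and GemmShape<4, 4, 1> is the tile each lane
// computes per step; together they cover the 16x32 warp tile used below (4*4 rows x 8*4 columns).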
using MmaWarp = cutlass::gemm::warp::MmaSimt<
cutlass::gemm::GemmShape<16, 32, 8>,
float,
cutlass::layout::RowMajor,
float,
cutlass::layout::ColumnMajor,
float,
cutlass::layout::RowMajor,
Policy
>;
// Number of 'K groups'
int const kKgroups = Shape::kK;
using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
typename MmaWarp::Shape,
typename MmaWarp::ThreadMma,
layout::RowMajor, // SMEM layout
typename MmaWarp::Policy
>;
using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorSimtCanonical<
typename MmaWarp::Shape,
typename MmaWarp::ThreadMma,
float, // ElementAccumulator
layout::RowMajor, // SMEM layout
typename MmaWarp::Policy
>;
using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
using TensorRefC = typename AccumulatorTileIterator::TensorRef;
public:
CUTLASS_HOST_DEVICE
GemmSimt() { }
CUTLASS_DEVICE
void operator()(
ElementScalar alpha,
TensorRefA ref_A,
TensorRefB ref_B,
ElementScalar beta,
TensorRefC ref_C,
TensorRefC ref_D,
int lane_id) const {
// Instantiate iterators pointing to slices of the A and B matrices in shared memory
typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);
// Instantiate and clear accumulator tile holding the C matrix
typename MmaWarp::FragmentC accum;
accum.clear();
// Instantiate the warp-level matrix multiply operator
MmaWarp mma_op;
// Instantiate fragments holding the slice of the matrix held by each warp
typename MmaWarp::FragmentA frag_A[2];
typename MmaWarp::FragmentB frag_B[2];
// Load fragments from shared memory
iter_A.load(frag_A[0]);
iter_B.load(frag_B[0]);
++iter_A;
++iter_B;
// Mainloop: iterate over the K groups
CUTLASS_PRAGMA_UNROLL
for (int k = 0; k < kKgroups; ++k) {
// Load fragments from shared memory
iter_A.load(frag_A[(k + 1) % 2]);
iter_B.load(frag_B[(k + 1) % 2]);
++iter_A;
++iter_B;
// Compute the matrix multiply
mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
}
// Instantiate iterators
FragmentIterator accum_frag_it(accum);
AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);
// Define function objects for linear scaling operation
cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;
// Iterate over the epilogue components
CUTLASS_PRAGMA_UNROLL
for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {
// Define storage for slices of the accumulators
typename FragmentIterator::Fragment accum_fragment;
typename FragmentIterator::Fragment source_fragment;
// Select a slice of accumulators from the accumulator tile
accum_frag_it.load(accum_fragment);
++accum_frag_it;
// Load a corresponding slice from Shared memory
source_tile_it.load(source_fragment);
++source_tile_it;
// Compute linear scaling - alpha * AB + beta * C
source_fragment = mul_source(beta, source_fragment);
accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);
// Store the result to shared memory
dest_tile_it.store(accum_fragment);
++dest_tile_it;
}
}
};
} // namespace warp
} // namespace gemm
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in Shared Memory.
__global__ void kernel(
float *D_gmem,
float alpha,
float const *A_gmem,
float const *B_gmem,
float beta,
float const *C_gmem) {
// Define several matrices in shared memory
__shared__ float A[kM][kK];
__shared__ float B[kN][kK];
__shared__ float C[kM][kN];
// Copy data into SMEM
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
for (int k = 0; k < kK; ++k) {
A[m][k] = A_gmem[m * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
for (int k = 0; k < kK; ++k) {
B[n][k] = B_gmem[n * kK + k];
}
}
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
C[m][n] = C_gmem[m * kN + n];
}
}
}
__syncthreads();
//
// Instantiate a warp-level matrix multiply operator given the overall problem shape,
// data type of each operand, and layout of each operand.
//
using GemmSimt = cutlass::gemm::warp::GemmSimt<
cutlass::gemm::GemmShape<kM, kN, kK>,
float, // Data type of A elements
cutlass::layout::RowMajor, // Layout of A matrix
float, // Data type of B elements
cutlass::layout::ColumnMajor, // Layout of B matrix
float, // Data type of C elements
cutlass::layout::RowMajor, // Layout of C matrix
float // Scalar type of alpha and beta
>;
// Instantiate the GEMM operator
GemmSimt gemm;
// Execute the warp-level GEMM operation
gemm(
alpha,
{&A[0][0], kK},
{&B[0][0], kK},
beta,
{&C[0][0], kN},
{&C[0][0], kN},
threadIdx.x);
__syncthreads();
// Copy results from SMEM back to global memory
if (threadIdx.x == 0) {
CUTLASS_PRAGMA_NO_UNROLL
for (int m = 0; m < kM; ++m) {
CUTLASS_PRAGMA_NO_UNROLL
for (int n = 0; n < kN; ++n) {
D_gmem[m * kN + n] = C[m][n];
}
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, const char *arg[]) {
cutlass::HostTensor<float, cutlass::layout::RowMajor> A({kM, kK});
cutlass::HostTensor<float, cutlass::layout::ColumnMajor> B({kK, kN});
cutlass::HostTensor<float, cutlass::layout::RowMajor> C({kM, kN});
cutlass::HostTensor<float, cutlass::layout::RowMajor> D({kM, kN});
uint64_t seed = 2020;
float max = 8;
float min = -8;
std::cout << "Simt canonical GEMM problem size = (" << cutlass::gemm::GemmShape<kM, kN, kK>() <<")" << std::endl;
cutlass::reference::host::TensorFillRandomUniform(
A.host_view(),
seed,
max,
min,
0
);
cutlass::reference::host::TensorFillRandomUniform(
B.host_view(),
seed + 17,
max,
min,
0
);
#if 0 // Debug: fill A sequentially and B as Identity matrix for debugging
cutlass::reference::host::BlockFillSequential(
A.host_view().data(), A.host_view().capacity());
cutlass::reference::host::TensorFillIdentity(B.host_view());
#endif
cutlass::reference::host::TensorFillRandomUniform(
C.host_view(),
seed + 31,
max,
min,
0
);
A.sync_device();
B.sync_device();
C.sync_device();
D.sync_device();
dim3 grid(1, 1);
dim3 block(32, 1, 1);
float alpha = 1.0f;
float beta = 0.0f;
kernel<<< grid, block >>>(
D.device_data(),
alpha,
A.device_data(),
B.device_data(),
beta,
C.device_data()
);
cudaError_t result = cudaDeviceSynchronize();
if (result != cudaSuccess) {
std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
return -1;
}
D.sync_host();
// Compute reference on host
cutlass::HostTensor<float, cutlass::layout::RowMajor> D_ref({kM, kN}, false);
cutlass::reference::host::TensorCopy(D_ref.host_view(), C.host_view());
cutlass::reference::host::Gemm<
float, cutlass::layout::RowMajor,
float, cutlass::layout::ColumnMajor,
float, cutlass::layout::RowMajor,
float, float> reference_gemm;
reference_gemm(
{kM, kN, kK},
alpha,
A.host_ref(),
B.host_ref(),
beta,
D_ref.host_ref(),
float()
);
// Verify reference matches computed
if (!cutlass::reference::host::TensorEquals(
D.host_view(),
D_ref.host_view())) {
std::cerr
<< "A =\n" << A.host_view()
<< "\n\nB = \n" << B.host_view()
<< "\n\nC = " << C.host_view()
<< "\n\nRef =\n" << D_ref.host_view()
<< "\n\nD =\n" << D.host_view() << "\n\n";
std::cerr << "Error - device results mismatch host reference." << std::endl;
return -1;
}
std::cout << "Passed" << std::endl;
return 0;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
21_quaternion_gemm
quaternion_gemm.cu
)

View File

@ -0,0 +1,448 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Result structure
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cudaError_t error;
bool passed;
//
// Methods
//
Result(
double runtime_ms = 0,
double gflops = 0,
cutlass::Status status = cutlass::Status::kSuccess,
cudaError_t error = cudaSuccess
):
runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::gemm::GemmCoord problem_size;
int batch_count;
cutlass::Quaternion<float> alpha;
cutlass::Quaternion<float> beta;
bool reference_check;
int iterations;
Options():
help(false),
problem_size({1024, 1024, 1024}),
batch_count(1),
alpha(1),
beta(),
reference_check(true),
iterations(20) { }
bool valid() {
return true;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
cmd.get_cmd_line_argument("m", problem_size.m());
cmd.get_cmd_line_argument("n", problem_size.n());
cmd.get_cmd_line_argument("k", problem_size.k());
cmd.get_cmd_line_argument("batch", batch_count);
cmd.get_cmd_line_argument("alpha", alpha.w());
cmd.get_cmd_line_argument("alpha_i", alpha.x());
cmd.get_cmd_line_argument("alpha_j", alpha.y());
cmd.get_cmd_line_argument("alpha_k", alpha.z());
cmd.get_cmd_line_argument("beta", beta.w());
cmd.get_cmd_line_argument("beta_i", beta.x());
cmd.get_cmd_line_argument("beta_j", beta.y());
cmd.get_cmd_line_argument("beta_k", beta.z());
cmd.get_cmd_line_argument("iterations", iterations);
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "21_quaternion_gemm example\n\n"
<< " This example uses the CUTLASS Library to execute Quaternion GEMM computations.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --m <int> GEMM M dimension\n"
<< " --n <int> GEMM N dimension\n"
<< " --k <int> GEMM K dimension\n"
<< " --batch <int> Number of GEMM operations executed in one batch\n"
<< " --alpha <f32> Epilogue scalar alpha (real part)\n"
<< " --alpha_i <f32> Epilogue scalar alpha_i (imaginary part)\n"
<< " --alpha_j <f32> Epilogue scalar alpha_j (imaginary part)\n"
<< " --alpha_k <f32> Epilogue scalar alpha_k (imaginary part)\n"
<< " --beta <f32> Epilogue scalar beta (real part)\n\n"
<< " --beta_i <f32> Epilogue scalar beta_i (imaginary part)\n\n"
<< " --beta_j <f32> Epilogue scalar beta_j (imaginary part)\n\n"
<< " --beta_k <f32> Epilogue scalar beta_k (imaginary part)\n\n"
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/21_quaternion_gemm/21_quaternion_gemm --batch=7 --m=1024 --n=512 --k=1024 \\\n"
<< " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n";
return out;
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of real-valued multiply-adds
int64_t fmas = problem_size.product() * batch_count * 16;
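// (A single quaternion multiply-add expands to 16 real multiply-adds, hence the factor of 16.)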
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using precision = float;
using Element = cutlass::Quaternion<float>;
using ElementComputeEpilogue = Element; // <- data type of epilogue operations
using ElementAccumulator = Element; // <- data type of accumulator
using ElementInputA = Element; // <- data type of elements in input matrix A
using ElementInputB = Element; // <- data type of elements in input matrix B
using ElementOutput = Element; // <- data type of elements in output matrix D
// The code section below describes matrix layout of input and output matrices. Row Major for
// Matrix A, Column Major for Matrix B and Row Major for Matrix C
using LayoutInputA = cutlass::layout::RowMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::RowMajor;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
cutlass::gemm::GemmShape<64, 64, 4>; // <- threadblock tile M = 64, N = 64, K = 4
// This code section describes tile size a warp will compute
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 16, 4>; // <- warp tile M = 32, N = 16, K = 4
// This code section describes the size of MMA op
using ShapeMMAOp = cutlass::gemm::GemmShape<1, 1, 1>; // <- MMA Op tile M = 1, N = 1, K = 1
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- Defaults
// This code section describes the epilogue part of the kernel
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // <- data type of output matrix
128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
// memory access. For a byte, it's 16
// elements. This becomes the vector width of
// math instructions in the epilogue too
ElementAccumulator, // <- data type of accumulator
ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function
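// Note: for Quaternion<float> (4 x 32 bits = 128 bits per element), the expression above
// evaluates to one element per vectorized memory access.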
// Number of pipelines you want to use
constexpr int NumStages = 2;
using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ShapeMMAThreadBlock,
ShapeMMAWarp,
ShapeMMAOp,
EpilogueOp,
SwizzleThreadBlock,
NumStages>;
int run(Options options) {
// PASS/FAIL status
bool passed = true;
// Create a tuple of problem size for matrix multiplication
cutlass::gemm::GemmCoord problem_size = options.problem_size;
// Initialize tensors using CUTLASS helper functions
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
problem_size.mk()); // <- Create matrix A with dimensions M x K
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
problem_size.kn()); // <- Create matrix B with dimensions K x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
problem_size.mn()); // <- Create matrix C with dimensions M x N
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// CUTLASS kernel
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from
// reference kernel
// Fill input and output matrices on host using CUTLASS helper functions
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
4,
-4,
0); // <- Fill matrix A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
4,
-4,
0); // <- Fill matrix B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_c.host_view(),
1,
4,
-4,
0); // <- Fill matrix C on host with uniform-distribution random data
cutlass::reference::host::TensorFill(
tensor_d.host_view()); // <- fill matrix D on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_d.sync_device();
tensor_ref_d.sync_device();
// Initialize alpha and beta for dot product computation
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split the K dimension into a single partition (no split-K)
int split_k_slices = 1;
// Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
// instantiated CUTLASS kernel
typename Gemm::Arguments arguments{problem_size, // <- problem size of matrix multiplication
tensor_a.device_ref(), // <- reference to matrix A on device
tensor_b.device_ref(), // <- reference to matrix B on device
tensor_c.device_ref(), // <- reference to matrix C on device
tensor_d.device_ref(), // <- reference to matrix D on device
{alpha, beta}, // <- tuple of alpha and beta
split_k_slices}; // <- k-dimension split factor
// Using the arguments, query for extra workspace required for matrix multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Instantiate CUTLASS kernel depending on templates
Gemm gemm_op;
// Check the problem size is supported or not
cutlass::Status status = gemm_op.can_implement(arguments);
CUTLASS_CHECK(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(status);
// Result structure
Result result;
//
// Construct events
//
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
}
// Record an event at the start of a series of GEMMs
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
//
// Run profiling loop
//
for (int iter = 0; iter < options.iterations; ++iter) {
// Launch initialized CUTLASS kernel
status = gemm_op();
CUTLASS_CHECK(status);
}
//
// Stop profiling loop
//
// Record an event when the GEMMs are complete
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return -1;
}
// Compute average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
if (options.reference_check) {
// Create instantiation for device reference gemm kernel
cutlass::reference::device::Gemm<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementComputeEpilogue> gemm_device;
// Launch device reference gemm kernel
gemm_device(problem_size,
alpha,
tensor_a.device_ref(),
tensor_b.device_ref(),
beta,
tensor_c.device_ref(),
tensor_ref_d.device_ref());
// Wait for kernels to finish
cudaDeviceSynchronize();
// Copy output data from CUTLASS and reference kernel to host for comparison
tensor_d.sync_host();
tensor_ref_d.sync_host();
// Check if output from CUTLASS kernel and reference kernel are equal or not
passed &= cutlass::reference::host::TensorEquals(
tensor_d.host_view(),
tensor_ref_d.host_view());
}
if (passed) {
std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
std::cout << " GFLOPs: " << result.gflops << std::endl;
}
std::cout << (passed ? "Passed" : "Failed") << std::endl;
return (passed ? 0 : -1);
}
int main(int argc, char const** argv) {
Options options;
options.parse(argc, argv);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
printf("%d x %d x %d Single Precision Quaternion Matrix Multiply\n", \
options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
return run(options);
}

View File

@ -0,0 +1,28 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright notice, this list of
# conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other materials
# provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
# to endorse or promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cutlass_example_add_executable(
22_quaternion_conv
quaternion_conv.cu
)

View File

@ -0,0 +1,660 @@
/***************************************************************************************************
* Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include <iostream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/tensor_view_io.h"
#include "helper.h"
// The code section below describes datatype for input, output tensors and computation between
// elements
using Element = cutlass::Quaternion<float>;
using ElementAccumulator = Element; // Data type of accumulator
using ElementComputeEpilogue = Element; // Data type of epilogue computation (alpha, beta)
using ElementInputA = Element; // Data type of elements in input tensor
using ElementInputB = Element; // Data type of elements in input tensor
using ElementOutput = Element; // Data type of elements in output tensor
using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;
// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;
// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;
// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>; // Threadblock tile shape
// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>; // Warp tile shape
// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; // SIMT instruction shape
// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
// Number of pipelines you want to use
constexpr int NumStages = 2;
// This code section describes whether the selected iterator algorithm is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
ElementOutput, // Data type of output matrix.
128 / cutlass::sizeof_bits<ElementOutput>::value, // The number of elements per vectorized.
// memory access. This becomes the vector width of
// math instructions in the epilogue too.
ElementAccumulator, // Data type of accumulator
ElementComputeEpilogue>; // Data type for alpha/beta in linear combination
using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
ElementInputA, LayoutInputA,
ElementInputB, LayoutInputB,
ElementOutput, LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch,
ThreadblockShape,
WarpShape,
InstructionShape,
EpilogueOp,
SwizzleThreadBlock,
NumStages,
cutlass::arch::OpMultiplyAdd,
IteratorAlgorithm
>::Kernel;
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
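// Descriptive note: under the implicit GEMM formulation, this Fprop convolution is computed as a
// GEMM of extent (N*P*Q) x K x (C*R*S); with the default problem size below (1x32x32x32 input,
// 32x3x3x32 filter, padding 1, stride 1) that is a 1024 x 32 x 288 quaternion GEMM.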
/////////////////////////////////////////////////////////////////////////////////////////////////
// Command line options parsing
struct Options {
bool help;
cutlass::Tensor4DCoord input_size;
cutlass::Tensor4DCoord filter_size;
cutlass::Tensor4DCoord padding;
cutlass::MatrixCoord conv_stride;
cutlass::MatrixCoord dilation;
bool reference_check;
bool measure_performance;
int iterations;
bool save_workspace;
ElementComputeEpilogue alpha;
ElementComputeEpilogue beta;
bool benchmark;
std::string tag;
Options():
help(false),
input_size(1, 32, 32, 32),
filter_size(32, 3, 3, 32),
padding(1, 1, 1, 1),
conv_stride(1, 1),
dilation(1, 1),
reference_check(false),
measure_performance(true),
iterations(20),
save_workspace(false),
alpha(1),
beta(0),
benchmark(false) { }
// Verify the problem size is compatible with the CUTLASS Convolution implementation.
bool valid() {
//
// CUTLASS attempts to use vectorized memory accesses. Consequently, all pointers, strides, and
// tensor extents are required to be divisible by kAlignment elements.
//
int const kAlignment = 8;
if ((input_size.c() % kAlignment) ||
(filter_size.n() % kAlignment)) {
// misaligned tensors
return false;
}
// Invalid padding
if ((padding.h() != filter_size.h() / 2) ||
(padding.w() != filter_size.w() / 2)) {
return false;
}
return true;
}
/// Updates input and filter sizes
void update(
cutlass::Tensor4DCoord input_size,
cutlass::Tensor4DCoord filter_size) {
this->input_size = input_size;
this->filter_size = filter_size;
padding.n() = filter_size.h() / 2;
padding.h() = filter_size.h() / 2;
padding.w() = filter_size.w() / 2;
padding.c() = filter_size.w() / 2;
}
// Parses the command line
void parse(int argc, char const **args) {
cutlass::CommandLine cmd(argc, args);
if (cmd.check_cmd_line_flag("help")) {
help = true;
}
if (cmd.check_cmd_line_flag("ref-check")) {
reference_check = true;
}
if (cmd.check_cmd_line_flag("perf-check")) {
measure_performance = true;
}
if (cmd.check_cmd_line_flag("save-workspace")) {
save_workspace = true;
}
if (cmd.check_cmd_line_flag("benchmark")) {
benchmark = true;
}
cmd.get_cmd_line_argument("n", input_size.n());
cmd.get_cmd_line_argument("h", input_size.h());
cmd.get_cmd_line_argument("w", input_size.w());
cmd.get_cmd_line_argument("c", input_size.c());
cmd.get_cmd_line_argument("k", filter_size.n());
cmd.get_cmd_line_argument("r", filter_size.h());
cmd.get_cmd_line_argument("s", filter_size.w());
filter_size.c() = input_size.c();
cmd.get_cmd_line_argument("alpha_w", alpha.w());
cmd.get_cmd_line_argument("alpha_x", alpha.x());
cmd.get_cmd_line_argument("alpha_y", alpha.y());
cmd.get_cmd_line_argument("alpha_z", alpha.z());
cmd.get_cmd_line_argument("beta_w", beta.w());
cmd.get_cmd_line_argument("beta_x", beta.x());
cmd.get_cmd_line_argument("beta_y", beta.y());
cmd.get_cmd_line_argument("beta_z", beta.z());
cmd.get_cmd_line_argument("iterations", iterations);
cmd.get_cmd_line_argument("tag", tag);
if (filter_size.h() == 3 && filter_size.w() == 3) {
padding = {1, 1, 1, 1};
}
else {
filter_size.h() = 1;
filter_size.w() = 1;
padding = {0, 0, 0, 0};
}
}
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {
out << "22_quaternion_conv example\n\n"
<< " This example uses Ampere's Tensor Core operators on F16 data types to compute\n"
<< " forward convolution on tensors of layout NHWC.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement.\n\n"
<< " --n <int> Input tensor extent N\n"
<< " --h <int> Input tensor extent H\n"
<< " --w <int> Input tensor extent W\n"
<< " --c <int> Input tensor extent C\n"
<< " --k <int> Filter extent K\n"
<< " --r <int> Filter extent R\n"
<< " --s <int> Filter extent S\n\n"
<< " --alpha <float> Epilogue scalar alpha\n"
<< " --beta <float> Epilogue scalar beta\n\n"
<< " --ref-check If set (true), reference check on the host is computed\n"
<< " --perf-check If set (true), performance is measured.\n"
<< " --benchmark If set (true), performance benchmarking on several layers and batch-size.\n"
<< " --iterations <int> Number of profiling iterations to perform.\n"
<< " --save-workspace If set, workspace is written to a text file.\n"
<< " --tag <string> String to replicate across the first column in the results table\n";
out << "\n\nExamples:\n\n"
<< "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
<< "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";
return out;
}
/// Computes the output tensor size (NPQK)
cutlass::Tensor4DCoord output_size() const {
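// With the default sizes above (H = W = 32, R = S = 3, padding 1, stride 1) this evaluates to a
// 1x32x32x32 output tensor (NPQK).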
return cutlass::Tensor4DCoord(
input_size.n(),
(input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
(input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
filter_size.n());
}
/// Compute performance in GFLOP/s
double gflops(double runtime_s) const {
// Number of multiply-adds = NPQK * CRS
int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()) * 16;
// Two flops per multiply-add
return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
struct Result {
double runtime_ms;
double gflops;
cutlass::Status status;
cutlass::Status reference_check;
cudaError_t error;
Result():
runtime_ms(0),
gflops(0),
status(cutlass::Status::kSuccess),
reference_check(cutlass::Status::kInvalid),
error(cudaSuccess) { }
static std::ostream & print_header(std::ostream &out, Options const &options) {
if (!options.tag.empty()) {
out << "Name,";
}
out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";
return out;
}
std::ostream & print(std::ostream &out, int idx, Options const &options) {
if (!options.tag.empty()) {
out << options.tag << ",";
}
out
<< "conv_" << idx << ","
<< options.input_size.n() << ","
<< options.input_size.h() << ","
<< options.input_size.w() << ","
<< options.input_size.c() << ","
<< options.filter_size.n() << ","
<< options.filter_size.h() << ","
<< options.filter_size.w() << ","
<< runtime_ms << ","
<< gflops;
return out;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Runs one benchmark
Result profile_convolution(Options const &options) {
Result result;
//
// Allocate host-device tensors using the CUTLASS Utilities.
//
cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());
//
// Initialize tensors
//
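// The uniform fills below take (view, seed, max, min, bits); with bits = 0 the
// generated values are whole numbers in the range [-8, 7].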
// Fill tensor A on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_a.host_view(),
1,
7,
-8,
0);
// Fill tensor B on host with uniform-distribution random data
cutlass::reference::host::TensorFillRandomUniform(
tensor_b.host_view(),
1,
7,
-8,
0);
// Fill tensor C on host with zeros
cutlass::reference::host::TensorFill(
tensor_c.host_view());
// Fill tensor C for reference on host with zeros
cutlass::reference::host::TensorFill(
tensor_ref_c.host_view());
// Copy data from host to GPU
tensor_a.sync_device();
tensor_b.sync_device();
tensor_c.sync_device();
tensor_ref_c.sync_device();
//
// Define arguments for CUTLASS Convolution
//
cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;
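// kCrossCorrelation applies the filter without flipping it, matching the
// convolution convention of most deep learning frameworks.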
// Partition the GEMM K dimension into a single slice (no split-K reduction)
int split_k_slices = 1;
// Construct Conv2dProblemSize with user defined output size
cutlass::conv::Conv2dProblemSize problem_size(
options.input_size,
options.filter_size,
options.padding,
options.conv_stride,
options.dilation,
options.output_size(),
mode,
split_k_slices
);
// Construct ImplicitGemm::Argument structure with conv2d
// problem size, data pointers, and epilogue values
typename ImplicitGemm::Arguments arguments{
problem_size,
tensor_a.device_ref(),
tensor_b.device_ref(),
tensor_c.device_ref(),
tensor_c.device_ref(),
{options.alpha, options.beta},
};
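// tensor_c serves as both the C operand and the D output, so the epilogue
// result overwrites tensor_c in place on the device.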
//
// Initialize CUTLASS Convolution
//
ImplicitGemm implicit_gemm_op;
size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
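// With a single split-K slice the kernel typically needs no auxiliary
// workspace, so this allocation is usually zero bytes.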
result.status = implicit_gemm_op.can_implement(arguments);
CUTLASS_CHECK(result.status);
result.status = implicit_gemm_op.initialize(arguments, workspace.get());
CUTLASS_CHECK(result.status);
//
// Launch initialized CUTLASS kernel
//
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
//
// Optional reference check
//
if (options.reference_check) {
std::cout << "Verification on host...\n";
// Compute with reference implementation
cutlass::reference::host::Conv2dFprop<
ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput,
LayoutOutput,
ElementComputeEpilogue,
ElementAccumulator,
cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>
>(
problem_size,
tensor_a.host_ref(),
tensor_b.host_ref(),
tensor_c.host_ref(),
tensor_ref_c.host_ref(),
options.alpha,
options.beta
);
// Check if output from CUTLASS kernel and reference kernel are equal or not
tensor_c.sync_host();
bool passed = cutlass::reference::host::TensorEquals(
tensor_c.host_view(),
tensor_ref_c.host_view());
if (!passed) {
result.reference_check = cutlass::Status::kErrorInternal;
std::cout << "ERROR - results miscompared.\n";
}
else {
result.reference_check = cutlass::Status::kSuccess;
std::cout << "Passed.\n";
}
}
else {
result.reference_check = cutlass::Status::kInvalid;
}
if (options.save_workspace) {
std::stringstream ss;
ss << "22_quaternion_conv_"
<< options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c()
<< "_"
<< options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c()
<< ".dat";
std::ofstream output_workspace(ss.str());
output_workspace
<< "Input = \n" << tensor_a.host_view() << "\n\n"
<< "Filters = \n" << tensor_b.host_view() << "\n\n";
if (options.reference_check) {
output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
}
output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;
std::cout << "Results written to '" << ss.str() << "'." << std::endl;
}
//
// Performance measurement
//
if (options.measure_performance) {
cudaEvent_t events[2];
for (auto & event : events) {
result.error = cudaEventCreate(&event);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
}
// Record an event at the start of a series of convolution operations.
result.error = cudaEventRecord(events[0]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Launch a sequence of implicit GEMM operations on the device
for (int iteration = 0; iteration < options.iterations; ++iteration) {
result.status = implicit_gemm_op();
CUTLASS_CHECK(result.status);
}
// Record an event when the convolutions have been launched.
result.error = cudaEventRecord(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Wait for work on the device to complete.
result.error = cudaEventSynchronize(events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Measure elapsed runtime
float runtime_ms = 0;
result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
if (result.error != cudaSuccess) {
std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
return result;
}
// Print average runtime and GFLOPs.
result.runtime_ms = double(runtime_ms) / double(options.iterations);
result.gflops = options.gflops(result.runtime_ms / 1000.0);
// Cleanup
for (auto event : events) {
(void)cudaEventDestroy(event);
}
}
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char const **args) {
Options options;
options.parse(argc, args);
if (options.help) {
options.print_usage(std::cout) << std::endl;
return 0;
}
if (options.benchmark) {
// Benchmark several layers
int batch_sizes[] = {1, 32, 64, 128, 256, 512};
struct Benchmark {
int h, w, c, k, r, s;
} layers[] = {
{56, 56, 64, 256, 1, 1},
{56, 56, 64, 64, 1, 1},
{56, 56, 64, 64, 3, 3},
{56, 56, 256, 64, 1, 1},
{56, 56, 256, 512, 1, 1},
{56, 56, 256, 128, 1, 1},
{28, 28, 128, 128, 3, 3},
{28, 28, 128, 512, 1, 1},
{28, 28, 512, 128, 1, 1},
{28, 28, 512, 1024, 1, 1},
{28, 28, 512, 256, 1, 1},
{14, 14, 256, 256, 3, 3},
{14, 14, 256, 1024, 1, 1},
{14, 14, 1024, 256, 1, 1},
{14, 14, 1024, 2048, 1, 1},
{14, 14, 1024, 512, 1, 1},
{7, 7, 512, 512, 3, 3},
};
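// The layer extents above roughly follow the convolution shapes of ResNet-50.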
Result::print_header(std::cout, options) << std::endl;
int idx = 1;
for (auto const &layer : layers) {
for (auto N : batch_sizes) {
options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});
Result result = profile_convolution(options);
result.print(std::cout, idx, options) << std::endl;
}
++idx;
}
}
else {
// Execute one problem size
if (!options.valid()) {
std::cerr << "Invalid problem." << std::endl;
return -1;
}
Result result = profile_convolution(options);
Result::print_header(std::cout, options) << std::endl;
result.print(std::cout, 1, options) << std::endl;
}
return 0;
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -28,10 +28,14 @@ add_custom_target(test_examples)
function(cutlass_example_add_executable NAME)
set(options)
set(oneValueArgs)
set(oneValueArgs DISABLE_TESTS)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (NOT DEFINED __DISABLE_TESTS)
set(__DISABLE_TESTS OFF)
endif()
cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS})
add_dependencies(cutlass_examples ${NAME})
@ -60,6 +64,7 @@ function(cutlass_example_add_executable NAME)
DEPENDEES test_examples ${__DEPENDEES}
TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS}
DISABLE_EXECUTABLE_INSTALL_RULE
DISABLE_TESTS ${__DISABLE_TESTS}
)
endfunction()
@ -83,6 +88,11 @@ foreach(EXAMPLE
15_ampere_sparse_tensorop_gemm
16_ampere_tensorop_conv2dfprop
17_fprop_per_channel_bias
18_ampere_fp64_tensorop_affine2_gemm
19_tensorop_canonical
20_simt_canonical
21_quaternion_gemm
22_quaternion_conv
)
add_subdirectory(${EXAMPLE})