@@ -119,12 +119,12 @@ cudaError_t cutlass_hgemm_nn(
   int K,
   cutlass::half_t alpha,
   cutlass::half_t const *A,
-  int lda,
+  cutlass::layout::ColumnMajor::Stride::Index lda,
   cutlass::half_t const *B,
-  int ldb,
+  cutlass::layout::ColumnMajor::Stride::Index ldb,
   cutlass::half_t beta,
   cutlass::half_t *C,
-  int ldc) {
+  cutlass::layout::ColumnMajor::Stride::Index ldc) {

   // Define the GEMM operation
   using Gemm = cutlass::gemm::device::Gemm<
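The leading dimensions above are now typed by the layout itself rather than as a bare int. A minimal sketch of what that typedef resolves to (assuming CUTLASS 2.x headers; the static_assert is purely illustrative):

    #include <type_traits>
    #include "cutlass/layout/matrix.h"

    // ColumnMajor carries a single stride; Stride::Index is the integer type of
    // its components, so lda/ldb/ldc stay in sync with the layout definition.
    using LdIndex = cutlass::layout::ColumnMajor::Stride::Index;
    static_assert(std::is_integral<LdIndex>::value, "leading dimensions are integral");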
@@ -67,7 +67,7 @@ beta * C).
 Now that we have set up the properties of the data, we have to set up the properties of the computation.

 Second, we create template variables of tile sizes for thread-block, warp and mma-op of 128x128x32,
-64x64x4, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
+64x64x32, 8x8x4 (MxNxK) respectively. When passed to instantiate the CUTLASS GEMM kernel, it internally
 deduces the number of threads needed per thread-block, the amount of shared memory, how to store data in
 a bank-conflict-free manner, and a ton of other variables required to compose, initialize and launch a
 high-performance GEMM kernel. This is the beauty of CUTLASS; it relieves the developer from
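For reference, those three tile sizes are spelled out with cutlass::gemm::GemmShape when the kernel is instantiated (a minimal sketch; the surrounding Gemm template arguments are elided):

    using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 32>;  // tile computed by one thread-block (MxNxK)
    using WarpShape        = cutlass::gemm::GemmShape<64, 64, 32>;    // tile computed by one warp
    using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;       // tile computed by one tensor core mma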
@@ -275,10 +275,10 @@ public:
     int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
     int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

-    int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
-    int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
-    int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-    int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);

     int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
     int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();

@@ -292,10 +292,11 @@ public:
     int64_t batch_stride_C = int64_t(problem_size.m()) * problem_size.n() * 2;
     int64_t batch_stride_D = int64_t(problem_size.m()) * problem_size.n() * 2;

-    int lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
-    int ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
-    int ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
-    int ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutA::Stride::Index lda = LayoutA::packed({problem_size.m(), problem_size.k()}).stride(0);
+    typename LayoutB::Stride::Index ldb = LayoutB::packed({problem_size.k(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldc = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+    typename LayoutC::Stride::Index ldd = LayoutC::packed({problem_size.m(), problem_size.n()}).stride(0);
+

     int64_t imag_stride_A = int64_t(problem_size.m()) * problem_size.k();
     int64_t imag_stride_B = int64_t(problem_size.k()) * problem_size.n();
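A hedged illustration of the idiom being retyped above: Layout::packed(extent) constructs the layout of a tightly packed tensor, and .stride(0) reads back its leading dimension, whose natural type is the layout's own Stride::Index (the extents here are illustrative):

    // For a packed column-major 1024 x 512 matrix the leading dimension is the row count.
    cutlass::layout::ColumnMajor layout = cutlass::layout::ColumnMajor::packed({1024, 512});
    cutlass::layout::ColumnMajor::Stride::Index lda = layout.stride(0);  // == 1024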
@@ -48,6 +48,10 @@ addition to its own input activation tile. Therefore the input activation warp t
 2nd GEMM/Conv only depends on the output warp accumulator of the 1st GEMM/Conv in the
 register file, and the operation can be fully register-file-resident.

+When applying the above constraint to convolutions, the 2nd Convolution kernel must not
+have halos, so that the data used by each threadblock does not depend on any other
+threadblock. Typically this requires the 2nd Convolution to use a 1x1 filter without any padding.
+
 # Copyright

 Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
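To make the no-halo condition concrete, here is a hedged sketch of a second-stage problem size that satisfies it, using the same six-argument Conv2dProblemSize constructor that the test files below use (the extents are illustrative):

    cutlass::conv::Conv2dProblemSize problem_size_1(
      {128, 56, 56, 64},    // input size  (NHWC)
      {128, 1, 1, 64},      // filter size (KRSC): 1x1 filter -> no halo
      {0, 0, 0, 0},         // zero padding
      {1, 1},               // unit stride
      {1, 1},               // unit dilation
      {128, 56, 56, 128}    // output size (NPQK)
    );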
@@ -36,8 +36,6 @@
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_conv2d_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_0 (

@@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm75_problem_size_1 (
   {128, 56, 56, 64}    // output size (NPQK)
 );

-void run_nonfused_conv2d_fprop_f16_sm75() {
+bool run_nonfused_conv2d_fprop_f16_sm75() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
       ElementC,
       128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2,
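Every EpilogueOutputOp0 hunk in these test files makes the same change, so one hedged sketch of the resulting type covers them all (template per cutlass/epilogue/thread/linear_combination.h): the extra ScaleType argument makes the intermediate epilogue compute D = alpha * accumulator and drop the beta * C term, which the interior stage of a fused pair does not need.

    using EpilogueOutputOp0 = cutlass::epilogue::thread::LinearCombination<
      cutlass::half_t,                                      // element type of C/D
      128 / cutlass::sizeof_bits<cutlass::half_t>::value,   // elements per vectorized access
      cutlass::half_t,                                      // accumulator element type
      cutlass::half_t,                                      // compute (alpha/beta) element type
      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling>;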
@@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_f16_sm75() {
+bool run_fused_conv2d_fprop_f16_sm75() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm75() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_f16_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
+bool run_nonfused_conv2d_fprop_optimized_f16_sm75() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
       ElementC,
       128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2,

@@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_optimized_f16_sm75() {
+bool run_fused_conv2d_fprop_optimized_f16_sm75() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -359,10 +364,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

 ////////////////////////////////////////////////////////////////////////////////

 #endif  // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
@@ -36,8 +36,6 @@
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_conv2d_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_0 (

@@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_f16_sm80_problem_size_1 (
   {128, 56, 56, 64}    // output size (NPQK)
 );

-void run_nonfused_conv2d_fprop_f16_sm80() {
+bool run_nonfused_conv2d_fprop_f16_sm80() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
       ElementC,
       128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     3,

@@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_f16_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_f16_sm80() {
+bool run_fused_conv2d_fprop_f16_sm80() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_f16_sm80() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -205,9 +206,10 @@ void run_fused_conv2d_fprop_f16_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
+bool run_nonfused_conv2d_fprop_optimized_f16_sm80() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -240,7 +242,8 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
       ElementC,
       128 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     3,

@@ -285,9 +288,10 @@ void run_nonfused_conv2d_fprop_optimized_f16_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_optimized_f16_sm80() {
+bool run_fused_conv2d_fprop_optimized_f16_sm80() {

   using ElementA = cutlass::half_t;
   using ElementB = cutlass::half_t;

@@ -311,7 +315,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -355,9 +360,8 @@ void run_fused_conv2d_fprop_optimized_f16_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

 ////////////////////////////////////////////////////////////////////////////////

 #endif  // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
@@ -36,8 +36,6 @@
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_interleaved_conv2d_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_0 (

@@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm75_problem_size_1 (
   {128, 56, 56, 64}    // output size (NPQK)
 );

-void run_nonfused_conv2d_fprop_s8_sm75() {
+bool run_nonfused_conv2d_fprop_s8_sm75() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
       ElementC,
       64 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2,

@@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_s8_sm75() {
+bool run_fused_conv2d_fprop_s8_sm75() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm75() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
+bool run_nonfused_conv2d_fprop_optimized_s8_sm75() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
       ElementC,
       64 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2,

@@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_optimized_s8_sm75() {
+bool run_fused_conv2d_fprop_optimized_s8_sm75() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
       ElementC,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -359,9 +364,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm75() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

 ////////////////////////////////////////////////////////////////////////////////

 #endif  // if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
@@ -36,8 +36,6 @@
 #include "device/b2b_implicit_gemm_convolution.h"
 #include "b2b_interleaved_conv2d_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_0 (

@@ -57,7 +55,7 @@ cutlass::conv::Conv2dProblemSize conv2d_s8_sm80_problem_size_1 (
   {128, 56, 56, 64}    // output size (NPQK)
 );

-void run_nonfused_conv2d_fprop_s8_sm80() {
+bool run_nonfused_conv2d_fprop_s8_sm80() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -90,7 +88,8 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
       ElementC,
       64 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     3,

@@ -135,9 +134,10 @@ void run_nonfused_conv2d_fprop_s8_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_s8_sm80() {
+bool run_fused_conv2d_fprop_s8_sm80() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -161,7 +161,8 @@ void run_fused_conv2d_fprop_s8_sm80() {
       ElementC,
       8 * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -207,9 +208,10 @@ void run_fused_conv2d_fprop_s8_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
+bool run_nonfused_conv2d_fprop_optimized_s8_sm80() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -242,7 +244,8 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
       ElementC,
       64 / cutlass::sizeof_bits<ElementC>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     3,

@@ -287,9 +290,10 @@ void run_nonfused_conv2d_fprop_optimized_s8_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

-void run_fused_conv2d_fprop_optimized_s8_sm80() {
+bool run_fused_conv2d_fprop_optimized_s8_sm80() {

   using ElementA = int8_t;
   using ElementB = int8_t;

@@ -313,7 +317,8 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
       ElementC,
       8 * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -359,10 +364,9 @@ void run_fused_conv2d_fprop_optimized_s8_sm80() {
   else
     std::cout << "Fail\n";

+  return pass;
 }

 ////////////////////////////////////////////////////////////////////////////////

 #endif  // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
@@ -39,14 +39,12 @@
 #include "device/b2b_gemm.h"
 #include "b2b_gemm_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_0(128*1600, 64, 576);
 cutlass::gemm::GemmCoord gemm_f16_sm75_problem_size_1(128*1600, 128, 64);

-void run_nonfused_gemm_f16() {
+bool run_nonfused_gemm_f16() {

   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;

@@ -80,7 +78,8 @@ void run_nonfused_gemm_f16() {
       ElementOutput,
       128 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2

@@ -116,9 +115,11 @@ void run_nonfused_gemm_f16() {
     std::cout << "Pass\n";
   else
     std::cout << "Fail\n";
+
+  return pass;
 }

-void run_fused_gemm_f16() {
+bool run_fused_gemm_f16() {

   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;

@@ -140,7 +141,8 @@ void run_fused_gemm_f16() {
       ElementOutput,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -183,7 +185,6 @@ void run_fused_gemm_f16() {
   else
     std::cout << "Fail\n";

+  return passed;
 }
 ////////////////////////////////////////////////////////////////////////////////

 #endif  //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
@@ -39,14 +39,12 @@
 #include "device/b2b_gemm.h"
 #include "b2b_gemm_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_0(128*1600, 64, 576);
 cutlass::gemm::GemmCoord gemm_f16_sm80_problem_size_1(128*1600, 128, 64);

-void run_nonfused_gemm_f16_sm80() {
+bool run_nonfused_gemm_f16_sm80() {

   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;

@@ -80,7 +78,8 @@ void run_nonfused_gemm_f16_sm80() {
       ElementOutput,
       128 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     3

@@ -116,9 +115,11 @@ void run_nonfused_gemm_f16_sm80() {
     std::cout << "Pass\n";
   else
     std::cout << "Fail\n";
+
+  return pass;
 }

-void run_fused_gemm_f16_sm80() {
+bool run_fused_gemm_f16_sm80() {

   using ElementOutput = cutlass::half_t;
   using ElementAccumulator = cutlass::half_t;

@@ -140,7 +141,8 @@ void run_fused_gemm_f16_sm80() {
       ElementOutput,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -183,7 +185,7 @@ void run_fused_gemm_f16_sm80() {
   else
     std::cout << "Fail\n";

+  return passed;

 }
 ////////////////////////////////////////////////////////////////////////////////

 #endif  //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
@@ -39,14 +39,12 @@
 #include "device/b2b_gemm.h"
 #include "b2b_interleaved_gemm_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_0(128*1600, 64, 576);
 cutlass::gemm::GemmCoord gemm_s8_sm75_problem_size_1(128*1600, 128, 64);

-void run_nonfused_gemm_s8() {
+bool run_nonfused_gemm_s8() {

   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;

@@ -80,7 +78,8 @@ void run_nonfused_gemm_s8() {
       ElementOutput,
       64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>,
     2

@@ -116,9 +115,11 @@ void run_nonfused_gemm_s8() {
     std::cout << "Pass\n";
   else
     std::cout << "Fail\n";
+
+  return pass;
 }

-void run_fused_gemm_s8() {
+bool run_fused_gemm_s8() {

   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;

@@ -140,7 +141,8 @@ void run_fused_gemm_s8() {
       ElementOutput,
       InstructionShape::kM * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -151,8 +153,6 @@ void run_fused_gemm_s8() {
       ElementCompute
     >;

-
-
   using B2bGemm = cutlass::gemm::device::B2bGemm<
     int8_t,
     cutlass::layout::ColumnMajorInterleaved<32>,

@@ -183,7 +183,7 @@ void run_fused_gemm_s8() {
   else
     std::cout << "Fail\n";

+  return passed;

 }
 ////////////////////////////////////////////////////////////////////////////////

 #endif  // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
@@ -39,14 +39,12 @@
 #include "device/b2b_gemm.h"
 #include "b2b_interleaved_gemm_run.h"

 #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-
 ////////////////////////////////////////////////////////////////////////////////
-
 cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_0(128*1600, 64, 576);
 cutlass::gemm::GemmCoord gemm_s8_sm80_problem_size_1(128*1600, 128, 64);

-void run_nonfused_gemm_s8_sm80() {
+bool run_nonfused_gemm_s8_sm80() {

   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;

@@ -80,7 +78,8 @@ void run_nonfused_gemm_s8_sm80() {
       ElementOutput,
       64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     3,

@@ -106,7 +105,8 @@ void run_nonfused_gemm_s8_sm80() {
       ElementOutput,
       64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >,
     cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
     3,

@@ -124,9 +124,11 @@ void run_nonfused_gemm_s8_sm80() {
     std::cout << "Pass\n";
   else
     std::cout << "Fail\n";
+
+  return pass;
 }

-void run_fused_gemm_s8_sm80() {
+bool run_fused_gemm_s8_sm80() {

   using ElementOutput = int8_t;
   using ElementAccumulator = int32_t;

@@ -148,7 +150,8 @@ void run_fused_gemm_s8_sm80() {
       ElementOutput,
       8 * InstructionShape::kN / 32,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

   using EpilogueOutputOp1 =

@@ -156,11 +159,10 @@ void run_fused_gemm_s8_sm80() {
       ElementOutput,
       64 / cutlass::sizeof_bits<ElementOutput>::value,
       ElementAccumulator,
-      ElementCompute
+      ElementCompute,
+      cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling
     >;

-
-
   using B2bGemm = cutlass::gemm::device::B2bGemm<
     int8_t,
     cutlass::layout::ColumnMajorInterleaved<32>,

@@ -183,8 +185,7 @@ void run_fused_gemm_s8_sm80() {
     16,
     16,
     false,
-    cutlass::arch::OpMultiplyAddSaturate,
-    true
+    cutlass::arch::OpMultiplyAddSaturate
   >;

   B2bInterleavedFusedGemmRun<B2bGemm, 32> fusedGemm;

@@ -196,7 +197,6 @@ void run_fused_gemm_s8_sm80() {
   else
     std::cout << "Fail\n";

+  return passed;
 }
 ////////////////////////////////////////////////////////////////////////////////

 #endif  // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
@@ -115,9 +115,7 @@ template <
     /// Operation performed by GEMM
     typename Operator_ = typename DefaultGemmConfiguration<
         OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_,
-        ElementAccumulator_>::Operator,
-    /// Whether Beta is zero or not
-    bool IsBetaZero = false>
+        ElementAccumulator_>::Operator>
 class B2bGemm {
 public:

@@ -148,7 +146,6 @@ class B2bGemm {
   static int const kAlignmentB = AlignmentB;
   static int const kAlignmentC = EpilogueOutputOp1::kCount;
   static bool const kSplitKSerial = SplitKSerial;
-  static bool const kIsBetaZero = IsBetaZero;
   static ComplexTransform const kTransformA = ComplexTransform::kNone;
   static ComplexTransform const kTransformB = ComplexTransform::kNone;

@@ -175,8 +172,7 @@ class B2bGemm {
     ThreadblockSwizzle,
     kStages,
     kSplitKSerial,
-    Operator,
-    kIsBetaZero
+    Operator
   >::B2bGemmKernel;

   /// Argument structure
@@ -422,7 +418,7 @@ public:
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

-    Status status = initialize(args, workspace);
+    Status status = initialize(args, workspace, stream);

     if (status == Status::kSuccess) {
       status = run(stream);

@@ -255,7 +255,7 @@ public:
     void *workspace = nullptr,
     cudaStream_t stream = nullptr) {

-    Status status = initialize(args, workspace);
+    Status status = initialize(args, workspace, stream);

     if (status == Status::kSuccess) {
       status = run(stream);
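A hedged sketch of the device-side call pattern after the two hunks above: the stream is now forwarded to initialize() as well as run(), so any setup work is issued on the same CUDA stream as the kernel launch (op, args and workspace_ptr are hypothetical names):

    cutlass::Status status = op.initialize(args, workspace_ptr, stream);
    if (status == cutlass::Status::kSuccess) {
      status = op.run(stream);   // kernel launch on the same stream
    }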
@@ -28,53 +28,14 @@
 #include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm75.h"
 #include "b2b_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.h"

-int run() {
-
-  cudaDeviceProp props;
-
-  cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (error != cudaSuccess) {
-    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
-    return -1;
-  }
-
-  if (!(props.major * 10 + props.minor >= 75)) {
-    std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75."
-              << std::endl;
-
-    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
-    return 0;
-  }
-
-#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-  std::cout << "Running on SM80" << std::endl;
-  run_nonfused_conv2d_fprop_optimized_f16_sm80();
-  run_fused_conv2d_fprop_optimized_f16_sm80();
-  run_nonfused_conv2d_fprop_optimized_s8_sm80();
-  run_fused_conv2d_fprop_optimized_s8_sm80();
-#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-  std::cout << "Running on SM75" << std::endl;
-  run_nonfused_conv2d_fprop_optimized_f16_sm75();
-  run_fused_conv2d_fprop_optimized_f16_sm75();
-  run_nonfused_conv2d_fprop_optimized_s8_sm75();
-  run_fused_conv2d_fprop_optimized_s8_sm75();
-#endif
-
-  return 0;
-}
-
-int main() {
-
+int run_sm75() {
   bool notSupported = false;

   // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
   //
   // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
   if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
     std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;

     notSupported = true;

   }

   cudaDeviceProp props;

@@ -85,10 +46,7 @@ int main() {
     return -1;
   }

-  if (!(props.major * 10 + props.minor >= 75)) {
-    std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
-              << std::endl;
-
+  if (!(props.major == 7 && props.minor >= 5)) {
     notSupported = true;
   }
@@ -96,7 +54,83 @@ int main() {
     // Returning zero so this test passes on older Toolkits. Its actions are no-op.
     return 0;
   }

-  return run();
+  bool pass = true;
+
+  std::cout << "Running on SM75" << std::endl;
+  pass &= run_nonfused_conv2d_fprop_optimized_f16_sm75();
+  pass &= run_fused_conv2d_fprop_optimized_f16_sm75();
+  pass &= run_nonfused_conv2d_fprop_optimized_s8_sm75();
+  pass &= run_fused_conv2d_fprop_optimized_s8_sm75();
+
+  if(pass)
+    return 1;
+  else
+    return -1;
+}
+
+int run_sm80() {
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+  //
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run Conv2dFprop examples.
+  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+    notSupported = true;
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major == 8 && props.minor >= 0)) {
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+  bool pass = true;
+
+  std::cout << "Running on SM80" << std::endl;
+  pass &= run_nonfused_conv2d_fprop_optimized_f16_sm80();
+  pass &= run_fused_conv2d_fprop_optimized_f16_sm80();
+  pass &= run_nonfused_conv2d_fprop_optimized_s8_sm80();
+  pass &= run_fused_conv2d_fprop_optimized_s8_sm80();
+
+  if(pass)
+    return 1;
+  else
+    return -1;
+}
+
+
+int main() {
+
+  int result = 0;
+
+  result = run_sm80();
+
+  if(!result) { // not supported
+    result = run_sm75();
+
+    if(!result) {
+      std::cout << "This example isn't supported on the current architecture" << std::endl;
+    }
+  }
+
+  if(result >= 0)
+    return 0;
+  else
+    return -1;
+}
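A reading aid for the dispatch convention the new runners use (this restates the contract implied by the code above, it is not new API):

    // Return-code contract of run_sm75() / run_sm80() as consumed by main():
    //    1  -> target supported, ran, and every test passed   (main exits 0)
    //    0  -> architecture or toolkit not supported; skipped (main tries the other target, then exits 0)
    //   -1  -> target supported and ran, but a test failed    (main exits -1)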
@@ -28,36 +28,15 @@
 #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h"
 #include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm80.h"

-int run() {
-
-#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
-  std::cout << "Running on SM80" << std::endl;
-  run_nonfused_gemm_f16_sm80();
-  run_fused_gemm_f16_sm80();
-  run_nonfused_gemm_s8_sm80();
-  run_fused_gemm_s8_sm80();
-#elif defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
-  std::cout << "Running on SM75" << std::endl;
-  run_nonfused_gemm_f16();
-  run_fused_gemm_f16();
-  run_nonfused_gemm_s8();
-  run_fused_gemm_s8();
-#endif
-
-  return 0;
-}
-
-int main() {
-
+int run_sm75() {
   bool notSupported = false;

   // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2.
   //
   // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples.
   if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) {
     std::cerr << "Tensor Core operations used in this example must be compiled with CUDA 10.2 Toolkit or later." << std::endl;

     notSupported = true;

   }

   cudaDeviceProp props;

@@ -68,10 +47,7 @@ int main() {
     return -1;
   }

-  if (!(props.major * 10 + props.minor >= 75)) {
-    std::cerr << "Tensor Ops used in this example must be run on a machine with compute capability at least 75."
-              << std::endl;
-
+  if (!(props.major == 7 && props.minor >= 5)) {
     notSupported = true;
   }
@@ -80,6 +56,86 @@ int main() {
     return 0;
   }

-  return run();
+  bool pass = true;
+
+  std::cout << "Running on SM75" << std::endl;
+  pass &= run_nonfused_gemm_f16();
+  pass &= run_fused_gemm_f16();
+  pass &= run_nonfused_gemm_s8();
+  pass &= run_fused_gemm_s8();
+
+  if(pass)
+    return 1;
+  else
+    return -1;
+
+}
+
+int run_sm80() {
+  bool notSupported = false;
+
+  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
+  //
+  // CUTLASS must be compiled with CUDA 11 Toolkit to run these examples.
+  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
+    notSupported = true;
+
+  }
+
+  cudaDeviceProp props;
+
+  cudaError_t error = cudaGetDeviceProperties(&props, 0);
+  if (error != cudaSuccess) {
+    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
+    return -1;
+  }
+
+  if (!(props.major == 8 && props.minor >= 0)) {
+    notSupported = true;
+  }
+
+  if (notSupported) {
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+  bool pass = true;
+
+  std::cout << "Running on SM80" << std::endl;
+  pass &= run_nonfused_gemm_f16_sm80();
+  pass &= run_fused_gemm_f16_sm80();
+  pass &= run_nonfused_gemm_s8_sm80();
+  pass &= run_fused_gemm_s8_sm80();
+
+  if(pass)
+    return 1;
+  else
+    return -1;
+
+}
+
+
+int main() {
+
+  int result = 0;
+
+  result = run_sm80();
+
+  if(!result) { // not supported
+    result = run_sm75();
+
+    if(!result) {
+      std::cout << "This example isn't supported on the current architecture" << std::endl;
+    }
+  }
+
+  if(result >= 0)
+    return 0;
+  else
+    return -1;
+}
@@ -66,6 +66,7 @@ struct B2bGemm {
     cutlass::gemm::GemmCoord problem_size_0;
     cutlass::gemm::GemmCoord problem_size_1;
     cutlass::gemm::GemmCoord grid_tiled_shape;
+    int swizzle_log_tile;
     typename B2bMma::IteratorA0::Params params_A0;
     typename B2bMma::IteratorA0::TensorRef ref_A0;
     typename B2bMma::IteratorB0::Params params_B0;

@@ -91,7 +92,7 @@ struct B2bGemm {
     //

     CUTLASS_HOST_DEVICE
-    Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
+    Params(): swizzle_log_tile(0), semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0),
       gemm_k_iterations_1(0), gemm_k_size_1(0) { }

     CUTLASS_HOST_DEVICE

@@ -112,6 +113,7 @@ struct B2bGemm {
       problem_size_0(problem_size_0),
      problem_size_1(problem_size_1),
      grid_tiled_shape(grid_tiled_shape),
+      swizzle_log_tile(ThreadblockSwizzle().get_log_tile(grid_tiled_shape)),
      params_A0(ref_A0.layout()),
      ref_A0(ref_A0),
      params_B0(ref_B0.layout()),

@@ -211,7 +213,7 @@ struct B2bGemm {
     ThreadblockSwizzle threadblock_swizzle;

     cutlass::gemm::GemmCoord threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

     // Early exit if CTA is out of range
     if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||

@@ -315,7 +317,7 @@ struct B2bGemm {
     //

     threadblock_tile_offset =
-        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

     //assume identity swizzle
     MatrixCoord threadblock_offset(
@@ -209,6 +209,7 @@ struct B2bImplicitGemmConvolution {
     cutlass::gemm::GemmCoord grid_tiled_shape;
     gemm::GemmCoord implicit_gemm_problem_size_0;
     gemm::GemmCoord implicit_gemm_problem_size_1;
+    int swizzle_log_tile;
     int gemm_k_iterations_0;
     int gemm_k_iterations_1;
     typename B2bMma::IteratorA0::Params iterator_A0;

@@ -233,7 +234,7 @@ struct B2bImplicitGemmConvolution {
     //

     CUTLASS_HOST_DEVICE
-    Params(): gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }
+    Params(): swizzle_log_tile(0), gemm_k_iterations_0(0), gemm_k_iterations_1(0) { }

     ///
     CUTLASS_HOST_DEVICE

@@ -245,7 +246,6 @@ struct B2bImplicitGemmConvolution {
       problem_size_1(args.problem_size_1),
       implicit_gemm_problem_size_0(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_0)),
       implicit_gemm_problem_size_1(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size_1)),
-      grid_tiled_shape(grid_tiled_shape),
       iterator_A0(B2bMma::IteratorA0::getParams(args.problem_size_0, args.ref_A0.layout())),
       ptr_A0(args.ref_A0.data()),
       iterator_B0(args.problem_size_0, args.ref_B0.layout()),

@@ -272,6 +272,8 @@ struct B2bImplicitGemmConvolution {
         implicit_gemm_problem_size_0,
         {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
         args.problem_size_0.split_k_slices);
+
+      swizzle_log_tile = ThreadblockSwizzle().get_log_tile(grid_tiled_shape);
     }
   };

@@ -296,7 +298,7 @@ struct B2bImplicitGemmConvolution {
     ThreadblockSwizzle threadblock_swizzle;

     cutlass::gemm::GemmCoord threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

     // Early exit if CTA is out of range
     if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||

@@ -379,7 +381,7 @@ struct B2bImplicitGemmConvolution {

     // Compute logical position within grid
     threadblock_tile_idx =
-        threadblock_swizzle.get_tile_offset(params.grid_tiled_shape);
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

     // If performing a reduction via split-K, fetch the initial synchronization
     if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
@@ -111,9 +111,7 @@ template <
     /// If true, kernel is configured to support serial reduction in the epilogue
     bool SplitKSerial,
     /// Operation performed by GEMM
-    typename Operator,
-    /// Beta is zero or not
-    bool IsBetaZero = false
+    typename Operator
 >
 struct DefaultB2bGemm;

@@ -321,9 +319,7 @@ template <
     /// epilogue
     bool SplitKSerial,
     /// Operation performed by GEMM
-    typename Operator,
-    /// Is Beta zero or not
-    bool IsBetaZero>
+    typename Operator>
 struct DefaultB2bGemm<
     ElementA, layout::ColumnMajorInterleaved<InterleavedK>, kAlignmentA,
     ElementB, layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,

@@ -332,7 +328,7 @@ struct DefaultB2bGemm<
     ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
     InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
     ThreadblockSwizzle, Stages,
-    SplitKSerial, Operator, IsBetaZero> {
+    SplitKSerial, Operator> {
   using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
   using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
   using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;

@@ -353,8 +349,7 @@ struct DefaultB2bGemm<
   using Epilogue = typename cutlass::epilogue::threadblock::
       DefaultInterleavedEpilogueTensorOp<
          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
-         64 / sizeof_bits<ElementC>::value, InterleavedK,
-         IsBetaZero>::Epilogue;
+         64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;

   /// Define the kernel-level GEMM operator.
   using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;

@@ -397,9 +392,7 @@ template <
     /// epilogue
     bool SplitKSerial,
     /// Operation performed by GEMM
-    typename Operator,
-    /// Is Beta zero or not
-    bool IsBetaZero>
+    typename Operator>
 struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
                       kAlignmentA, ElementB,
                       layout::RowMajorInterleaved<InterleavedK>, kAlignmentB,

@@ -407,7 +400,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
                       int32_t, arch::OpClassTensorOp, arch::Sm75,
                       ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
                       InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1,
-                      ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> {
+                      ThreadblockSwizzle, 2, SplitKSerial, Operator> {
   using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>;
   using LayoutB = layout::RowMajorInterleaved<InterleavedK>;
   using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>;

@@ -426,8 +419,7 @@ struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>,
   using Epilogue = typename cutlass::epilogue::threadblock::
       DefaultInterleavedEpilogueTensorOp<
          ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1,
-         64 / sizeof_bits<ElementC>::value, InterleavedK,
-         IsBetaZero>::Epilogue;
+         64 / sizeof_bits<ElementC>::value, InterleavedK>::Epilogue;

   /// Define the kernel-level GEMM operator.
   using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>;
@@ -43,14 +43,122 @@ fp32 data by using NVIDIA Ampere architecture.

 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm.h"

+#include "cutlass/util/command_line.h"
 #include "cutlass/util/host_tensor.h"
 #include "cutlass/util/reference/device/gemm.h"
 #include "cutlass/util/reference/host/tensor_compare.h"
 #include "cutlass/util/reference/host/tensor_copy.h"
 #include "cutlass/util/reference/host/tensor_fill.h"
 #include "cutlass/util/tensor_view_io.h"

 #include "helper.h"

+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Result structure
+struct Result {
+
+  double runtime_ms;
+  double gflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;
+
+  //
+  // Methods
+  //
+
+  Result(
+    double runtime_ms = 0,
+    double gflops = 0,
+    cutlass::Status status = cutlass::Status::kSuccess,
+    cudaError_t error = cudaSuccess
+  ):
+    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options parsing
+struct Options {
+
+  bool help;
+
+  cutlass::gemm::GemmCoord problem_size;
+  int batch_count;
+  float alpha;
+  float beta;
+
+  bool reference_check;
+  int iterations;
+
+  Options():
+    help(false),
+    problem_size({5120, 4096, 4096}),
+    batch_count(1),
+    reference_check(true),
+    iterations(20),
+    alpha(1),
+    beta() { }
+
+  bool valid() {
+    return true;
+  }
+
+  // Parses the command line
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+
+    if (cmd.check_cmd_line_flag("help")) {
+      help = true;
+    }
+
+    cmd.get_cmd_line_argument("m", problem_size.m());
+    cmd.get_cmd_line_argument("n", problem_size.n());
+    cmd.get_cmd_line_argument("k", problem_size.k());
+
+    cmd.get_cmd_line_argument("alpha", alpha);
+    cmd.get_cmd_line_argument("beta", beta);
+
+    cmd.get_cmd_line_argument("iterations", iterations);
+  }
+
+  /// Prints the usage statement.
+  std::ostream & print_usage(std::ostream &out) const {
+
+    out << "14_ampere_tf32_tensorop_gemm example\n\n"
+      << "  This example uses the CUTLASS Library to execute TF32 tensorop GEMM computations.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement.\n\n"
+      << "  --m <int>                   GEMM M dimension\n"
+      << "  --n <int>                   GEMM N dimension\n"
+      << "  --k <int>                   GEMM K dimension\n"
+      << "  --alpha <f32>               Epilogue scalar alpha\n"
+      << "  --beta <f32>                Epilogue scalar beta\n\n"
+      << "  --iterations <int>          Number of profiling iterations to perform.\n\n";
+
+    out << "\n\nExamples:\n\n"
+      << "$ ./examples/14_ampere_tf32_tensorop_gemm/14_ampere_tf32_tensorop_gemm --m=1024 --n=512 --k=1024 \\\n"
+      << "     --alpha=2 --beta=0.707 \n\n";
+
+    return out;
+  }
+
+  /// Compute performance in GFLOP/s
+  double gflops(double runtime_s) const {
+
+    // Number of real-valued multiply-adds
+    int64_t fmas = problem_size.product() * batch_count;
+
+    // Two flops per multiply-add
+    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
 // The code section below describes datatype for input, output matrices and computation between
 // elements in input matrices.
 using ElementAccumulator = float;                   // <- data type of accumulator
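One worked instance of the gflops() method added above, for the default 5120x4096x4096 problem with batch_count = 1: fmas = 5120 * 4096 * 4096 ≈ 8.59e10 multiply-adds, i.e. about 1.718e11 flops per run, so a 10 ms run yields 2 * 8.59e10 / 1e9 / 0.010 ≈ 17,180 GFLOP/s.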
@@ -111,14 +219,10 @@ using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
                                          SwizzleThreadBlock,
                                          NumStages>;

-int run() {
-
-  const int length_m = 5120;
-  const int length_n = 4096;
-  const int length_k = 4096;
+int run(Options &options) {

   // Create a tuple of problem size for matrix multiplication
-  cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k);
+  cutlass::gemm::GemmCoord problem_size = options.problem_size;

   // Initialize tensors using CUTLASS helper functions
   cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(

@@ -166,8 +270,8 @@ int run() {
   tensor_ref_d.sync_device();

   // Initialize alpha and beta for dot product computation
-  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
-  ElementComputeEpilogue beta = ElementComputeEpilogue(0);
+  ElementComputeEpilogue alpha = ElementComputeEpilogue(options.alpha);
+  ElementComputeEpilogue beta = ElementComputeEpilogue(options.beta);

   // Split K dimension into 1 partitions
   int split_k_slices = 1;

@@ -199,9 +303,74 @@ int run() {
   status = gemm_op.initialize(arguments, workspace.get());
   CUTLASS_CHECK(status);

-  // Launch initialized CUTLASS kernel
-  status = gemm_op();
-  CUTLASS_CHECK(status);
+  // Result structure
+  Result result;
+
+  //
+  // Construct events
+  //
+
+  cudaEvent_t events[2];
+
+  for (auto & event : events) {
+    result.error = cudaEventCreate(&event);
+    if (result.error != cudaSuccess) {
+      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
+      return -1;
+    }
+  }
+
+  // Record an event at the start of a series of GEMMs
+  result.error = cudaEventRecord(events[0]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  //
+  // Run profiling loop
+  //
+
+  for (int iter = 0; iter < options.iterations; ++iter) {
+    // Launch initialized CUTLASS kernel
+    status = gemm_op();
+    CUTLASS_CHECK(status);
+  }
+
+  //
+  // Stop profiling loop
+  //
+
+  // Record an event when the GEMMs are complete
+  result.error = cudaEventRecord(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Wait for work on the device to complete.
+  result.error = cudaEventSynchronize(events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Measure elapsed runtime
+  float runtime_ms = 0;
+  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+  if (result.error != cudaSuccess) {
+    std::cerr << "cudaEventElapsed() failed: " << cudaGetErrorString(result.error) << std::endl;
+    return -1;
+  }
+
+  // Compute average runtime and GFLOPs.
+  result.runtime_ms = double(runtime_ms) / double(options.iterations);
+  result.gflops = options.gflops(result.runtime_ms / 1000.0);
+
+  // Cleanup
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }

   // Create instantiation for device reference gemm kernel
   cutlass::reference::device::Gemm<ElementInputA,
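Note on the measurement added above: the two events bracket all options.iterations kernel launches on the same stream, so result.runtime_ms is the wall time of the whole loop divided by the iteration count. Amortizing over many launches keeps per-launch overhead out of the GFLOP/s estimate.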
@@ -235,12 +404,17 @@ int run() {
     tensor_d.host_view(),
     tensor_ref_d.host_view());

+  if (passed) {
+    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
+    std::cout << " GFLOPs: " << result.gflops << std::endl;
+  }
+
   std::cout << (passed ? "Passed" : "Failed") << std::endl;

   return (passed ? 0 : -1);
 }

-int main() {
+int main(int argc, const char **argv) {

   bool notSupported = false;

@@ -272,5 +446,21 @@ int main() {
     return 0;
   }

-  return run();
+  Options options;
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  printf("%d x %d x %d TF32 tensor op Matrix Multiply\n",
+    options.problem_size.m(), options.problem_size.n(), options.problem_size.k());
+
+  if (!options.valid()) {
+    std::cerr << "Invalid problem." << std::endl;
+    return -1;
+  }
+
+  return run(options);
 }
@@ -152,7 +152,7 @@ int run() {
   cutlass::HostTensor<ElementInputE, LayoutInputE> tensor_e(
       cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));
   // Same size as the above. The above one needs to be reordered and stored in this one.
-  cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
+  cutlass::HostTensor<ElementInputE, ReorderedLayoutInputE> tensor_e_reordered(
       cutlass::make_Coord(problem_size.m(), problem_size.k() / kSparse / kElementsPerElementE));

   // Fill input and output matrices on host using CUTLASS helper functions
@@ -158,7 +158,7 @@ using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSw
 constexpr int NumStages = 3;

 // This code section describes whether the iterator algorithm selected is Analytic or Optimized
-static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kAnalytic;
+static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;

 // This code section describes the epilogue part of the kernel; we use the default value
 using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
||||
|
||||
using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Command line options parsing
|
||||
@ -755,6 +754,3 @@ int main(int argc, char const **args) {
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
|
||||
|
||||
examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt (new file, 28 lines)

@@ -0,0 +1,28 @@
+# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are permitted
+# provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright notice, this list of
+#       conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright notice, this list of
+#       conditions and the following disclaimer in the documentation and/or other materials
+#       provided with the distribution.
+#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+#       to endorse or promote products derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+cutlass_example_add_executable(
+  18_ampere_fp64_tensorop_affine2_gemm
+  ampere_fp64_tensorop_affine2_gemm.cu
+)
+
examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu (new file, 336 lines)
@ -0,0 +1,336 @@
/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/**
  In a normal GEMM, the fast-changing dimension of a matrix always has stride
  equal to 1, e.g. ColumnMajor and RowMajor matrices. An Affine2 matrix can have
  a stride larger than 1 in both dimensions. To support such a layout, we need to
  change the method used to visit global memory:

    1. We can only visit one element at a time, because elements are no longer
       stored consecutively. Vectorized load/store is not possible.
    2. One extra multiplication is needed to calculate the global memory
       address:
         addr = base_pointer + coord1 * stride1 + coord2 * stride2

  The rest of the GEMM, which includes shared memory load/store and the mma
  computation, is the same.

  This example uses an Ampere fp64 tensor core Affine2 GEMM. SIMT kernels
  (e.g. sgemm, dgemm) also support the Affine2 layout.
*/
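
// A concrete illustration of the Affine2 address computation described above
// (the names below are hypothetical and not part of the example):
//
//   double const *base;                 // base pointer of the matrix
//   int64_t stride1 = 3, stride2 = 4;   // element strides of the two dimensions
//   // element at logical coordinate (coord1, coord2):
//   double const *addr = base + coord1 * stride1 + coord2 * stride2;
//
// Since neither stride is 1, logically adjacent elements are not adjacent in
// memory, which is why vectorized load/store is not possible.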
#include <iostream>
#include <sstream>

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/default_gemm_with_k_reduction.h"
#include "cutlass/reduction/device/reduce_split_k.h"
#include "cutlass/reduction/kernel/reduce_split_k.h"
#include "cutlass/reduction/thread/reduction_operators.h"
#include "cutlass/matrix_coord.h"

#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"

#include "helper.h"

// The code section below describes the datatypes for the input and output tensors
// and the computation between elements
using ElementAccumulator = double;                 // Data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // Data type of epilogue computation
using ElementInputA = double;                      // Data type of elements in input tensor
using ElementInputB = double;                      // Data type of elements in input tensor
using ElementOutput = double;                      // Data type of elements in output tensor

// Since Affine2 explicitly lists the strides of both dimensions, it does not really matter if
// it is column-major or row-major. However, it helps CUTLASS improve load locality if
// it knows which dimension of the A/B operands has the smaller stride, i.e. is denser.
//
// Affine2 ColumnMajor means the row stride is smaller, and Affine2 RowMajor means the column
// stride is smaller.
//
// The Affine2 epilogue reuses the AffineN epilogue, so it does not need to specify column-major
// or row-major.
using LayoutInputA = cutlass::layout::AffineRank2ColumnMajor;
using LayoutInputB = cutlass::layout::AffineRank2RowMajor;
using LayoutOutput = cutlass::layout::AffineRankN<2>;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassTensorOp;

// This code section describes the CUDA SM architecture number
using SmArch = cutlass::arch::Sm80;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 16>;  // Threadblock tile shape

// This code section describes the tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<64, 32, 16>;           // Warp tile shape

// This code section describes the size of the MMA op
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;       // TensorCore instruction shape

// This code section describes how threadblocks are scheduled on the GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>;

// Number of pipeline stages you want to use
constexpr int NumStages = 3;

// This code section describes the epilogue part of the kernel; we use the default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,           // Data type of output matrix.
    1,                       // The number of elements per vectorized memory
                             // access. It has to be 1 for Affine2.
    ElementComputeEpilogue>;

using GemmKernel = typename cutlass::gemm::kernel::DefaultGemmUniversal<
    ElementInputA, LayoutInputA, cutlass::ComplexTransform::kNone, 1,  // AlignmentA has to be 1
    ElementInputB, LayoutInputB, cutlass::ComplexTransform::kNone, 1,  // AlignmentB has to be 1
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    cutlass::arch::OpMultiplyAdd
>::GemmKernel;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

/////////////////////////////////////////////////////////////////////////////////////////////////

int run() {

  // Construct a GEMM problem size with user-defined output size
  cutlass::gemm::GemmCoord problem_size = {1024, 512, 1024};

  // The stride factor gives the distance between two elements in the different dimensions:
  // the first value is the logical distance between two rows, the second between two columns.
  // CUTLASS has a utility, cutlass::layout::Affine2Layout_Factory<Layout>::layout_factory,
  // to help convert a stride_factor into the two strides.
  //
  // It is also totally fine to compute the strides directly, without using the utility, to
  // construct the affine2 layout.
  typename LayoutInputA::Stride::Index stride_factor_A[] = {3, 4};
  typename LayoutInputB::Stride::Index stride_factor_B[] = {5, 6};
  typename LayoutOutput::Stride::Index stride_factor_C[] = {7, 8};
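  // A minimal sketch of the direct route mentioned above: construct the layout
  // from hand-picked strides (s0 and s1 are placeholders, and avoiding element
  // aliasing then becomes the caller's responsibility):
  //
  //   LayoutInputA layout_A({s0, s1});
  //   cutlass::HostTensor<ElementInputA, LayoutInputA> a(problem_size.mk(), layout_A);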

  // Initialize tensors using CUTLASS helper functions
  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(problem_size.mk(),
      cutlass::layout::Affine2Layout_Factory<LayoutInputA>::layout_factory(problem_size.mk(),
                                                                           stride_factor_A));
  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(problem_size.kn(),
      cutlass::layout::Affine2Layout_Factory<LayoutInputB>::layout_factory(problem_size.kn(),
                                                                           stride_factor_B));

  // Create matrix C used as the source operand for bias addition
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(problem_size.mn(),
      cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
                                                                           stride_factor_C));

  // Create matrix D used to store the output of the CUTLASS kernel
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(problem_size.mn(),
      cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
                                                                           stride_factor_C));

  // Create matrix D with dimensions M x N used to store the output of the reference kernel
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(problem_size.mn(),
      cutlass::layout::Affine2Layout_Factory<LayoutOutput>::layout_factory(problem_size.mn(),
                                                                           stride_factor_C));

  // Fill input and output matrices on host using CUTLASS helper functions
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(),
      1,
      ElementInputA(4),
      ElementInputA(-4),
      0);  // <- Fill matrix A on host with uniform-distribution random data

  cutlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(),
      1,
      ElementInputB(4),
      ElementInputB(-4),
      0);  // <- Fill matrix B on host with uniform-distribution random data

  cutlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(),
      1,
      ElementOutput(4),
      ElementOutput(-4),
      0);  // <- Fill matrix C on host with uniform-distribution random data

  cutlass::reference::host::TensorFill(
      tensor_d.host_view());      // <- fill matrix D on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_ref_d.host_view());  // <- fill matrix D for reference on host with zeros

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();
  tensor_ref_d.sync_device();

  // Initialize alpha and beta for the linear combination
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
  ElementComputeEpilogue beta = ElementComputeEpilogue(1);

  cutlass::gemm::GemmUniversalMode mode = cutlass::gemm::GemmUniversalMode::kGemm;

  int batch_count = 1;

  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch the
  // instantiated CUTLASS kernel
  typename Gemm::Arguments arguments{
      mode,
      problem_size,
      batch_count,
      {alpha, beta},
      tensor_a.device_ref().data(),  // <- reference to matrix A on device
      tensor_b.device_ref().data(),  // <- reference to matrix B on device
      tensor_c.device_ref().data(),  // <- reference to matrix C on device
      tensor_d.device_ref().data(),  // <- reference to matrix D on device
      tensor_a.layout().capacity(problem_size.mk()),
      tensor_b.layout().capacity(problem_size.kn()),
      tensor_c.layout().capacity(problem_size.mn()),
      tensor_d.layout().capacity(problem_size.mn()),
      tensor_a.layout().stride(),
      tensor_b.layout().stride(),
      tensor_c.layout().stride(),
      tensor_d.layout().stride()
  };

  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  // Check whether the problem size is supported or not
  cutlass::Status status = gemm_op.can_implement(arguments);
  CUTLASS_CHECK(status);

  // Initialize CUTLASS kernel with arguments and workspace pointer
  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Launch initialized CUTLASS kernel
  status = gemm_op();

  CUTLASS_CHECK(status);

  //
  // Create instantiation for device reference gemm kernel
  //

  // Launch the device reference GEMM to compute alpha * A * B + beta * C
  cutlass::reference::device::Gemm<
      ElementInputA,
      LayoutInputA,
      ElementInputB,
      LayoutInputB,
      ElementOutput,
      LayoutOutput,
      ElementComputeEpilogue,
      ElementAccumulator> gemm_device;

  gemm_device(
      problem_size,
      alpha,
      tensor_a.device_ref(),
      tensor_b.device_ref(),
      beta,
      tensor_c.device_ref(),
      tensor_ref_d.device_ref()
  );

  // Wait for kernels to finish
  cudaDeviceSynchronize();

  // Copy output data from the CUTLASS and reference kernels to host for comparison
  tensor_d.sync_host();
  tensor_ref_d.sync_host();

  bool pass = cutlass::reference::host::TensorEquals(tensor_d.host_view(),
                                                     tensor_ref_d.host_view());

  // Check whether the outputs from the CUTLASS kernel and reference kernel are equal
  std::cout << (pass
                ? "Passed"
                : "Failed")
            << std::endl;

  CUTLASS_CHECK(status);

  return 0;
}

int main(int argc, char const **args) {

  bool notSupported = false;

  // Ampere Tensor Core operations exposed with mma.sync are first available in CUDA 11.0.
  //
  // CUTLASS must be compiled with the CUDA 11 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0))) {
    std::cerr << "Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
    notSupported = true;
  }

  cudaDeviceProp props;
  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));

  if (!(props.major > 8 || (props.major == 8 && props.minor >= 0))) {
    std::cerr << "Ampere Tensor Ops must be run on a machine with compute capability at least 80."
              << std::endl;
    notSupported = true;
  }

  if (notSupported) {
    return 0;
  }

  return run();
}

/////////////////////////////////////////////////////////////////////////////////////////////////
examples/19_tensorop_canonical/CMakeLists.txt (new file, 27 lines)
@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright notice, this list of
#       conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cutlass_example_add_executable(
  19_tensorop_canonical
  tensorop_canonical.cu
  )

examples/19_tensorop_canonical/tensorop_canonical.cu (new file, 432 lines)
@ -0,0 +1,432 @@
/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*
  This example requires an NVIDIA Ampere GPU or later.
*/

// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>

// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/default_mma_tensor_op.h"
#include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h"
#include "cutlass/epilogue/warp/tile_iterator_tensor_op.h"

// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/gemm_complex.h"

///////////////////////////////////////////////////////////////////////////////////////////////////

// Define the overall warp-level problem shape
int const kM = 27;
int const kN = 31;
int const kK = 17;

///////////////////////////////////////////////////////////////////////////////////////////////////

// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. It
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.

namespace cutlass {
namespace gemm {
namespace warp {

template <
  typename Shape,
  typename InstructionShape,
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementScalar
>
class GemmTensorOp {
public:

  using WarpShape = GemmShape<
    ((Shape::kM + InstructionShape::kM - 1) / InstructionShape::kM) * InstructionShape::kM,
    ((Shape::kN + InstructionShape::kN - 1) / InstructionShape::kN) * InstructionShape::kN,
    InstructionShape::kK
  >;
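  // Note: the expressions above round the requested M and N up to multiples of
  // the instruction shape; with the 8x8x4 instruction used below, kM = 27 and
  // kN = 31 both round up to 32, so the warp computes a slightly padded tile.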
  using MmaWarp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
    WarpShape,
    InstructionShape,
    double,                        // Data type of A elements
    cutlass::layout::RowMajor,     // Layout of A matrix
    double,                        // Data type of B elements
    cutlass::layout::ColumnMajor,  // Layout of B matrix
    double,                        // Data type of C elements
    cutlass::layout::RowMajor      // Layout of C matrix
  >::Type;

  // Number of 'K groups'
  static int const kKgroups = (Shape::kK + InstructionShape::kK - 1) / InstructionShape::kK;
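  // With kK = 17 and InstructionShape::kK = 4, this evaluates to
  // (17 + 3) / 4 = 5 K-groups; the last group is only partially filled.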

  // Define a 'FragmentIterator' to iterate over slices of accumulators
  using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp<
    typename MmaWarp::Shape,
    InstructionShape,
    double,
    typename MmaWarp::Policy::Operator::FragmentC,
    cutlass::layout::RowMajor
  >;

  // Define an epilogue 'Tile Iterator' to iterate over slices of elements in shared memory
  using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpCanonical<
    typename MmaWarp::Shape,
    InstructionShape,
    double,
    cutlass::layout::RowMajor
  >;

  using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
  using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
  using TensorRefC = typename AccumulatorTileIterator::TensorRef;

public:
  CUTLASS_HOST_DEVICE
  GemmTensorOp() { }

  CUTLASS_DEVICE
  void operator()(
    ElementScalar alpha,
    TensorRefA ref_A,
    TensorRefB ref_B,
    ElementScalar beta,
    TensorRefC ref_C,
    TensorRefC ref_D,
    int lane_id) const {

    // Instantiate iterators pointing to slices of the A and B matrices in shared memory
    typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
    typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);

    // Instantiate and clear the accumulator tile holding the C matrix
    typename MmaWarp::FragmentC accum;
    accum.clear();

    // Instantiate the warp-level matrix multiply operator
    MmaWarp mma_op;

    // Instantiate fragments holding the slice of the matrix held by each warp
    typename MmaWarp::FragmentA frag_A[2];
    typename MmaWarp::FragmentB frag_B[2];
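    // Two fragments per operand implement double buffering: the MMA consumes
    // buffer (k % 2) while the next K-group is prefetched into ((k + 1) % 2).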

    // Load fragments from shared memory
    iter_A.load(frag_A[0]);
    iter_B.load(frag_B[0]);

    ++iter_A;
    ++iter_B;

    // Mainloop: iterate over the K dimension one K-group at a time
    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < kKgroups; ++k) {

      // Load fragments from shared memory
      iter_A.load(frag_A[(k + 1) % 2]);
      iter_B.load(frag_B[(k + 1) % 2]);

      ++iter_A;
      ++iter_B;

      // Compute the matrix multiply
      mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
    }

    // Instantiate iterators
    FragmentIterator accum_frag_it(accum);
    AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
    AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);

    // Define function objects for the linear scaling operation
    cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
    cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;

    // Iterate over the epilogue components
    CUTLASS_PRAGMA_UNROLL
    for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {

      // Define storage for slices of the accumulators
      typename FragmentIterator::Fragment accum_fragment;
      typename FragmentIterator::Fragment source_fragment;

      // Select a slice of accumulators from the accumulator tile
      accum_frag_it.load(accum_fragment);
      ++accum_frag_it;

      // Load a corresponding slice from shared memory
      source_tile_it.load(source_fragment);
      ++source_tile_it;

      // Compute linear scaling - alpha * AB + beta * C
      source_fragment = mul_source(beta, source_fragment);
      accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);

      // Store the result to shared memory
      dest_tile_it.store(accum_fragment);
      ++dest_tile_it;
    }
  }
};

} // namespace warp
} // namespace gemm
} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////

// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in shared memory.
__global__ void kernel(
  double *D_gmem,
  double alpha,
  double const *A_gmem,
  double const *B_gmem,
  double beta,
  double const *C_gmem) {

  // Define several matrices in shared memory
  __shared__ double A[kM][kK];
  __shared__ double B[kN][kK];
  __shared__ double C[kM][kN];
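  // Note that B is stored as [kN][kK], i.e. column-major for a kK x kN matrix,
  // matching the ColumnMajor layout the warp-level operator expects for B.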

  // Copy data into SMEM
  if (threadIdx.x == 0) {
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      for (int k = 0; k < kK; ++k) {
        A[m][k] = A_gmem[m * kK + k];
      }
    }
    CUTLASS_PRAGMA_NO_UNROLL
    for (int n = 0; n < kN; ++n) {
      for (int k = 0; k < kK; ++k) {
        B[n][k] = B_gmem[n * kK + k];
      }
    }
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      CUTLASS_PRAGMA_NO_UNROLL
      for (int n = 0; n < kN; ++n) {
        C[m][n] = C_gmem[m * kN + n];
      }
    }
  }

  __syncthreads();

  //
  // Instantiate a warp-level matrix multiply operator given the fundamental instruction shape (8x8x4),
  // overall shape, data type of each operand, and layout of each operand.
  //

  using GemmTensorOp = cutlass::gemm::warp::GemmTensorOp<
    cutlass::gemm::GemmShape<kM, kN, kK>,
    cutlass::gemm::GemmShape<8, 8, 4>,
    double,                        // Data type of A elements
    cutlass::layout::RowMajor,     // Layout of A matrix
    double,                        // Data type of B elements
    cutlass::layout::ColumnMajor,  // Layout of B matrix
    double,                        // Data type of C elements
    cutlass::layout::RowMajor,     // Layout of C matrix
    double                         // Scalar type of alpha and beta
  >;

  // Instantiate the GEMM operator
  GemmTensorOp gemm;

  // Execute the warp-level GEMM operation
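  // Each argument of the form {pointer, ld} constructs a TensorRef from a base
  // pointer and a leading dimension; C is passed as both the epilogue source
  // and destination, so the result is written back into the C tile in place.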
  gemm(
    alpha,
    {&A[0][0], kK},
    {&B[0][0], kK},
    beta,
    {&C[0][0], kN},
    {&C[0][0], kN},
    threadIdx.x);

  __syncthreads();

  // Copy the result from SMEM back to global memory
  if (threadIdx.x == 0) {
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      CUTLASS_PRAGMA_NO_UNROLL
      for (int n = 0; n < kN; ++n) {
        D_gmem[m * kN + n] = C[m][n];
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

/// Entry point to the canonical warp-level GEMM operation
int main(int argc, const char *arg[]) {

  bool notSupported = false;

  // CUTLASS must be compiled with the CUDA 11 Toolkit to run these examples.
  if (!(__CUDACC_VER_MAJOR__ >= 11)) {
    std::cerr << "NVIDIA Ampere Tensor Core operations must be compiled with CUDA 11.0 Toolkit or later." << std::endl;
    notSupported = true;
  }

  cudaDeviceProp props;

  cudaError_t error = cudaGetDeviceProperties(&props, 0);
  if (error != cudaSuccess) {
    std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl;
    return -1;
  }

  if (!((props.major * 10 + props.minor) >= 80)) {
    std::cerr << "This example requires compute capability at least 80."
              << std::endl;
    notSupported = true;
  }

  if (notSupported) {
    // Return 0 so tests are considered passing if run on unsupported platforms.
    return 0;
  }

  cutlass::HostTensor<double, cutlass::layout::RowMajor> A({kM, kK});
  cutlass::HostTensor<double, cutlass::layout::ColumnMajor> B({kK, kN});
  cutlass::HostTensor<double, cutlass::layout::RowMajor> C({kM, kN});
  cutlass::HostTensor<double, cutlass::layout::RowMajor> D({kM, kN});

  uint64_t seed = 2020;
  double max = 8;
  double min = -8;

  cutlass::reference::host::TensorFillRandomUniform(
    A.host_view(),
    seed,
    max,
    min,
    0
  );

  cutlass::reference::host::TensorFillRandomUniform(
    B.host_view(),
    seed + 17,
    max,
    min,
    0
  );

  cutlass::reference::host::TensorFillRandomUniform(
    C.host_view(),
    seed + 31,
    max,
    min,
    0
  );

  A.sync_device();
  B.sync_device();
  C.sync_device();
  D.sync_device();

  dim3 grid(1, 1);
  dim3 block(32, 1, 1);
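  // A single block of 32 threads, i.e. exactly one warp: the warp-level
  // operator defined above is executed collectively by the lanes of one warp.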

  double alpha = 2.25;
  double beta = 1.24;

  kernel<<< grid, block >>>(
    D.device_data(),
    alpha,
    A.device_data(),
    B.device_data(),
    beta,
    C.device_data()
  );

  cudaError_t result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
    return -1;
  }

  D.sync_host();

  // Compute reference on host
  cutlass::HostTensor<double, cutlass::layout::RowMajor> D_ref({kM, kN}, false);

  cutlass::reference::host::GemmComplex(
    {kM, kN, kK},
    alpha,
    A.host_ref(),
    cutlass::ComplexTransform::kNone,
    B.host_ref(),
    cutlass::ComplexTransform::kNone,
    beta,
    C.host_ref(),
    D_ref.host_ref(),
    double()
  );

  // Verify the reference matches the computed result
  if (!cutlass::reference::host::TensorEquals(
        D.host_view(),
        D_ref.host_view())) {

    std::cerr
      << "A =\n" << A.host_view()
      << "\n\nB = \n" << B.host_view()
      << "\n\nC = " << C.host_view()
      << "\n\nRef =\n" << D_ref.host_view()
      << "\n\nD =\n" << D.host_view() << "\n\n";

    std::cerr << "Error - device results mismatch host reference." << std::endl;

    return -1;
  }

  std::cout << "Passed" << std::endl;

  return 0;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
examples/20_simt_canonical/CMakeLists.txt (new file, 27 lines)
@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright notice, this list of
#       conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cutlass_example_add_executable(
  20_simt_canonical
  simt_canonical.cu
  )

examples/20_simt_canonical/simt_canonical.cu (new file, 419 lines)
@ -0,0 +1,419 @@
/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*
  This example requires an NVIDIA Maxwell GPU or later.
*/

// Standard Library includes
#include <iostream>
#include <sstream>
#include <vector>

// CUTLASS Includes
#include "cutlass/cutlass.h"
#include "cutlass/core_io.h"
#include "cutlass/functional.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/warp/mma_simt.h"
#include "cutlass/epilogue/warp/fragment_iterator_simt.h"
#include "cutlass/epilogue/warp/tile_iterator_simt.h"

// CUTLASS Utility Includes
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"

#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/gemm_complex.h"

///////////////////////////////////////////////////////////////////////////////////////////////////

// Define the overall warp-level problem shape
int const kM = 14;
int const kN = 27;
int const kK = 17;

///////////////////////////////////////////////////////////////////////////////////////////////////

// Define a warp-level GEMM operator.
//
// This template could be part of the CUTLASS Template Library or implemented internally. It
// wraps the matrix multiply operation and epilogue with a GEMM-like interface that can be
// instantiated in device code.

namespace cutlass {
namespace gemm {
namespace warp {

template <
  typename Shape,
  typename ElementA,
  typename LayoutA,
  typename ElementB,
  typename LayoutB,
  typename ElementC,
  typename LayoutC,
  typename ElementScalar
>
class GemmSimt {
public:

  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
    cutlass::MatrixShape<4, 8>,
    cutlass::layout::RowMajorInterleaved<2>,
    cutlass::gemm::GemmShape<4, 4, 1>
  >;
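  // This policy arranges the 32 lanes of a warp as a 4 x 8 grid, with each
  // lane computing a 4 x 4 x 1 tile per step, so one warp covers a 16 x 32
  // tile, matching the M and N of the warp shape used below.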
  using MmaWarp = cutlass::gemm::warp::MmaSimt<
    cutlass::gemm::GemmShape<16, 32, 8>,
    float,
    cutlass::layout::RowMajor,
    float,
    cutlass::layout::ColumnMajor,
    float,
    cutlass::layout::RowMajor,
    Policy
  >;

  // Number of 'K groups'
  static int const kKgroups = Shape::kK;

  using FragmentIterator = cutlass::epilogue::warp::FragmentIteratorSimt<
    typename MmaWarp::Shape,
    typename MmaWarp::ThreadMma,
    layout::RowMajor,         // SMEM layout
    typename MmaWarp::Policy
  >;

  using AccumulatorTileIterator = cutlass::epilogue::warp::TileIteratorSimtCanonical<
    typename MmaWarp::Shape,
    typename MmaWarp::ThreadMma,
    float,                    // ElementAccumulator
    layout::RowMajor,         // SMEM layout
    typename MmaWarp::Policy
  >;

  using TensorRefA = typename MmaWarp::IteratorA::TensorRef;
  using TensorRefB = typename MmaWarp::IteratorB::TensorRef;
  using TensorRefC = typename AccumulatorTileIterator::TensorRef;

public:
  CUTLASS_HOST_DEVICE
  GemmSimt() { }

  CUTLASS_DEVICE
  void operator()(
    ElementScalar alpha,
    TensorRefA ref_A,
    TensorRefB ref_B,
    ElementScalar beta,
    TensorRefC ref_C,
    TensorRefC ref_D,
    int lane_id) const {

    // Instantiate iterators pointing to slices of the A and B matrices in shared memory
    typename MmaWarp::IteratorA iter_A(ref_A, {Shape::kM, Shape::kK}, lane_id);
    typename MmaWarp::IteratorB iter_B(ref_B, {Shape::kK, Shape::kN}, lane_id);

    // Instantiate and clear the accumulator tile holding the C matrix
    typename MmaWarp::FragmentC accum;
    accum.clear();

    // Instantiate the warp-level matrix multiply operator
    MmaWarp mma_op;

    // Instantiate fragments holding the slice of the matrix held by each warp;
    // two buffers per operand enable double buffering across K groups
    typename MmaWarp::FragmentA frag_A[2];
    typename MmaWarp::FragmentB frag_B[2];

    // Load fragments from shared memory
    iter_A.load(frag_A[0]);
    iter_B.load(frag_B[0]);

    ++iter_A;
    ++iter_B;

    // Mainloop: iterate over the K dimension one K-group at a time
    CUTLASS_PRAGMA_UNROLL
    for (int k = 0; k < kKgroups; ++k) {

      // Load fragments from shared memory
      iter_A.load(frag_A[(k + 1) % 2]);
      iter_B.load(frag_B[(k + 1) % 2]);

      ++iter_A;
      ++iter_B;

      // Compute the matrix multiply
      mma_op(accum, frag_A[k % 2], frag_B[k % 2], accum);
    }

    // Instantiate iterators
    FragmentIterator accum_frag_it(accum);
    AccumulatorTileIterator source_tile_it(ref_C, {Shape::kM, Shape::kN}, lane_id);
    AccumulatorTileIterator dest_tile_it(ref_D, {Shape::kM, Shape::kN}, lane_id);

    // Define function objects for the linear scaling operation
    cutlass::multiplies<typename FragmentIterator::Fragment> mul_source;
    cutlass::multiply_add<typename FragmentIterator::Fragment> mul_add_accumulator;

    // Iterate over the epilogue components
    CUTLASS_PRAGMA_UNROLL
    for (int idx = 0; idx < FragmentIterator::kIterations; ++idx) {

      // Define storage for slices of the accumulators
      typename FragmentIterator::Fragment accum_fragment;
      typename FragmentIterator::Fragment source_fragment;

      // Select a slice of accumulators from the accumulator tile
      accum_frag_it.load(accum_fragment);
      ++accum_frag_it;

      // Load a corresponding slice from shared memory
      source_tile_it.load(source_fragment);
      ++source_tile_it;

      // Compute linear scaling - alpha * AB + beta * C
      source_fragment = mul_source(beta, source_fragment);
      accum_fragment = mul_add_accumulator(alpha, accum_fragment, source_fragment);

      // Store the result to shared memory
      dest_tile_it.store(accum_fragment);
      ++dest_tile_it;
    }
  }
};

} // namespace warp
} // namespace gemm
} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////

// Sample kernel demonstrating a collective GEMM operation by a warp on arbitrary matrices held
// in shared memory.
__global__ void kernel(
  float *D_gmem,
  float alpha,
  float const *A_gmem,
  float const *B_gmem,
  float beta,
  float const *C_gmem) {

  // Define several matrices in shared memory
  __shared__ float A[kM][kK];
  __shared__ float B[kN][kK];
  __shared__ float C[kM][kN];

  // Copy data into SMEM
  if (threadIdx.x == 0) {
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      for (int k = 0; k < kK; ++k) {
        A[m][k] = A_gmem[m * kK + k];
      }
    }
    CUTLASS_PRAGMA_NO_UNROLL
    for (int n = 0; n < kN; ++n) {
      for (int k = 0; k < kK; ++k) {
        B[n][k] = B_gmem[n * kK + k];
      }
    }
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      CUTLASS_PRAGMA_NO_UNROLL
      for (int n = 0; n < kN; ++n) {
        C[m][n] = C_gmem[m * kN + n];
      }
    }
  }

  __syncthreads();

  //
  // Instantiate a warp-level matrix multiply operator given the overall problem shape,
  // data type of each operand, and layout of each operand.
  //

  using GemmSimt = cutlass::gemm::warp::GemmSimt<
    cutlass::gemm::GemmShape<kM, kN, kK>,
    float,                         // Data type of A elements
    cutlass::layout::RowMajor,     // Layout of A matrix
    float,                         // Data type of B elements
    cutlass::layout::ColumnMajor,  // Layout of B matrix
    float,                         // Data type of C elements
    cutlass::layout::RowMajor,     // Layout of C matrix
    float                          // Scalar type of alpha and beta
  >;

  // Instantiate the GEMM operator
  GemmSimt gemm;

  // Execute the warp-level GEMM operation
  gemm(
    alpha,
    {&A[0][0], kK},
    {&B[0][0], kK},
    beta,
    {&C[0][0], kN},
    {&C[0][0], kN},
    threadIdx.x);

  __syncthreads();

  // Copy the result from SMEM back to global memory
  if (threadIdx.x == 0) {
    CUTLASS_PRAGMA_NO_UNROLL
    for (int m = 0; m < kM; ++m) {
      CUTLASS_PRAGMA_NO_UNROLL
      for (int n = 0; n < kN; ++n) {
        D_gmem[m * kN + n] = C[m][n];
      }
    }
  }
}

///////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, const char *arg[]) {

  cutlass::HostTensor<float, cutlass::layout::RowMajor> A({kM, kK});
  cutlass::HostTensor<float, cutlass::layout::ColumnMajor> B({kK, kN});
  cutlass::HostTensor<float, cutlass::layout::RowMajor> C({kM, kN});
  cutlass::HostTensor<float, cutlass::layout::RowMajor> D({kM, kN});

  uint64_t seed = 2020;
  float max = 8;
  float min = -8;

  std::cout << "Simt canonical GEMM problem size = (" << cutlass::gemm::GemmShape<kM, kN, kK>() << ")" << std::endl;

  cutlass::reference::host::TensorFillRandomUniform(
    A.host_view(),
    seed,
    max,
    min,
    0
  );

  cutlass::reference::host::TensorFillRandomUniform(
    B.host_view(),
    seed + 17,
    max,
    min,
    0
  );

#if 0 // Debug: fill A sequentially and B as the identity matrix
  cutlass::reference::host::BlockFillSequential(
    A.host_view().data(), A.host_view().capacity());

  cutlass::reference::host::TensorFillIdentity(B.host_view());
#endif

  cutlass::reference::host::TensorFillRandomUniform(
    C.host_view(),
    seed + 31,
    max,
    min,
    0
  );

  A.sync_device();
  B.sync_device();
  C.sync_device();
  D.sync_device();

  dim3 grid(1, 1);
  dim3 block(32, 1, 1);

  float alpha = 1.0f;
  float beta = 0.0f;

  kernel<<< grid, block >>>(
    D.device_data(),
    alpha,
    A.device_data(),
    B.device_data(),
    beta,
    C.device_data()
  );

  cudaError_t result = cudaDeviceSynchronize();
  if (result != cudaSuccess) {
    std::cerr << "Failed to synchronize device after kernel launch." << std::endl;
    return -1;
  }

  D.sync_host();

  // Compute reference on host
  cutlass::HostTensor<float, cutlass::layout::RowMajor> D_ref({kM, kN}, false);
  cutlass::reference::host::TensorCopy(D_ref.host_view(), C.host_view());
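  // D_ref is seeded with a copy of C because the host reference GEMM below
  // reads its last tensor argument as the C operand and overwrites it with D.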

  cutlass::reference::host::Gemm<
    float, cutlass::layout::RowMajor,
    float, cutlass::layout::ColumnMajor,
    float, cutlass::layout::RowMajor,
    float, float> reference_gemm;

  reference_gemm(
    {kM, kN, kK},
    alpha,
    A.host_ref(),
    B.host_ref(),
    beta,
    D_ref.host_ref(),
    float()
  );

  // Verify the reference matches the computed result
  if (!cutlass::reference::host::TensorEquals(
        D.host_view(),
        D_ref.host_view())) {

    std::cerr
      << "A =\n" << A.host_view()
      << "\n\nB = \n" << B.host_view()
      << "\n\nC = " << C.host_view()
      << "\n\nRef =\n" << D_ref.host_view()
      << "\n\nD =\n" << D.host_view() << "\n\n";

    std::cerr << "Error - device results mismatch host reference." << std::endl;

    return -1;
  }

  std::cout << "Passed" << std::endl;

  return 0;

}
///////////////////////////////////////////////////////////////////////////////////////////////////
examples/21_quaternion_gemm/CMakeLists.txt (new file, 27 lines)
@ -0,0 +1,27 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright notice, this list of
#       conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cutlass_example_add_executable(
  21_quaternion_gemm
  quaternion_gemm.cu
  )

examples/21_quaternion_gemm/quaternion_gemm.cu (new file, 448 lines)
@ -0,0 +1,448 @@
/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#include <iostream>

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"

#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/tensor_view_io.h"

#include "helper.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Result structure
struct Result {

  double runtime_ms;
  double gflops;
  cutlass::Status status;
  cudaError_t error;
  bool passed;

  //
  // Methods
  //

  Result(
    double runtime_ms = 0,
    double gflops = 0,
    cutlass::Status status = cutlass::Status::kSuccess,
    cudaError_t error = cudaSuccess
  ):
    runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;

  cutlass::gemm::GemmCoord problem_size;
  int batch_count;
  cutlass::Quaternion<float> alpha;
  cutlass::Quaternion<float> beta;

  bool reference_check;
  int iterations;

  Options():
    help(false),
    problem_size({1024, 1024, 1024}),
    batch_count(1),
    reference_check(true),
    iterations(20),
    alpha(1),
    beta() { }

  bool valid() {
    return true;
  }

  // Parses the command line
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    cmd.get_cmd_line_argument("m", problem_size.m());
    cmd.get_cmd_line_argument("n", problem_size.n());
    cmd.get_cmd_line_argument("k", problem_size.k());
    cmd.get_cmd_line_argument("batch", batch_count);

    cmd.get_cmd_line_argument("alpha", alpha.w());
    cmd.get_cmd_line_argument("alpha_i", alpha.x());
    cmd.get_cmd_line_argument("alpha_j", alpha.y());
    cmd.get_cmd_line_argument("alpha_k", alpha.z());

    cmd.get_cmd_line_argument("beta", beta.w());
    cmd.get_cmd_line_argument("beta_i", beta.x());
    cmd.get_cmd_line_argument("beta_j", beta.y());
    cmd.get_cmd_line_argument("beta_k", beta.z());

    cmd.get_cmd_line_argument("iterations", iterations);
  }
|
||||
|
||||
/// Prints the usage statement.
|
||||
std::ostream & print_usage(std::ostream &out) const {
|
||||
|
||||
out << "21_quaternion_gemm example\n\n"
|
||||
<< " This example uses the CUTLASS Library to execute Quaternion GEMM computations.\n\n"
|
||||
<< "Options:\n\n"
|
||||
<< " --help If specified, displays this usage statement.\n\n"
|
||||
<< " --m <int> GEMM M dimension\n"
|
||||
<< " --n <int> GEMM N dimension\n"
|
||||
<< " --k <int> GEMM K dimension\n"
|
||||
<< " --batch <int> Number of GEMM operations executed in one batch\n"
|
||||
<< " --alpha <f32> Epilogue scalar alpha (real part)\n"
|
||||
<< " --alpha_i <f32> Epilogue scalar alpha_i (imaginary part)\n"
|
||||
<< " --alpha_j <f32> Epilogue scalar alpha_j (imaginary part)\n"
|
||||
<< " --alpha_k <f32> Epilogue scalar alpha_k (imaginary part)\n"
|
||||
<< " --beta <f32> Epilogue scalar beta (real part)\n\n"
|
||||
<< " --beta_i <f32> Epilogue scalar beta_i (imaginary part)\n\n"
|
||||
<< " --beta_j <f32> Epilogue scalar beta_j (imaginary part)\n\n"
|
||||
<< " --beta_k <f32> Epilogue scalar beta_k (imaginary part)\n\n"
|
||||
<< " --iterations <int> Number of profiling iterations to perform.\n\n";
|
||||
|
||||
out << "\n\nExamples:\n\n"
|
||||
<< "$ ./examples/21_quaternion_gemm/21_quaternion_gemm --batch=7 --m=1024 --n=512 --k=1024 \\\n"
|
||||
<< " --alpha=2 --alpha_i=-2 --beta=0.707 --beta_i=-.707\n\n";
|
||||
|
||||
return out;
|
||||
}
|
||||

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {

    // Number of real-valued multiply-adds: one quaternion multiply-add
    // expands to 16 real fused multiply-adds
    int64_t fmas = problem_size.product() * batch_count * 16;

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////////

// The code section below describes datatype for input, output matrices and computation between
// elements in input matrices.
using precision = float;
using Element = cutlass::Quaternion<float>;
using ElementComputeEpilogue = Element;  // <- data type of epilogue operations
using ElementAccumulator = Element;      // <- data type of accumulator
using ElementInputA = Element;           // <- data type of elements in input matrix A
using ElementInputB = Element;           // <- data type of elements in input matrix B
using ElementOutput = Element;           // <- data type of elements in output matrix D

// The code section below describes matrix layout of input and output matrices. Row Major for
// Matrix A, Column Major for Matrix B and Row Major for Matrix C
using LayoutInputA = cutlass::layout::RowMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::RowMajor;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;

// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;

// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
    cutlass::gemm::GemmShape<64, 64, 4>;  // <- threadblock tile M = 64, N = 64, K = 4
// This code section describes tile size a warp will compute
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 16, 4>;  // <- warp tile M = 32, N = 16, K = 4
// This code section describes the size of MMA op
using ShapeMMAOp = cutlass::gemm::GemmShape<1, 1, 1>;  // <- MMA Op tile M = 1, N = 1, K = 1
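
// Note on the 1x1x1 shape above: with OpClassSimt there is no hardware MMA
// instruction, so each "instruction" is a scalar quaternion multiply-add
// executed per thread in registers (16 real FMAs per quaternion product).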

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;  // <- Defaults

// This code section describes the epilogue part of the kernel
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,  // <- data type of output matrix
    128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
                                                       // memory access. For quaternion<float>
                                                       // (128 bits) this is one element. This
                                                       // becomes the vector width of math
                                                       // instructions in the epilogue too
    ElementAccumulator,       // <- data type of accumulator
    ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function

// Number of pipelines you want to use
constexpr int NumStages = 2;
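
// Two stages form a double-buffered software pipeline: while the math loop
// consumes one shared-memory tile, the next tile is loaded from global memory.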

using Gemm = cutlass::gemm::device::Gemm<ElementInputA,
                                         LayoutInputA,
                                         ElementInputB,
                                         LayoutInputB,
                                         ElementOutput,
                                         LayoutOutput,
                                         ElementAccumulator,
                                         MMAOp,
                                         SmArch,
                                         ShapeMMAThreadBlock,
                                         ShapeMMAWarp,
                                         ShapeMMAOp,
                                         EpilogueOp,
                                         SwizzleThreadBlock,
                                         NumStages>;
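
// The assembled kernel computes D = alpha * (A x B) + beta * C where every
// element is a quaternion. As a host-side sketch (using only the types above;
// a, b, c are hypothetical operands), the per-element epilogue arithmetic is:
//
//   Element a, b, c;
//   Element d = alpha * a * b + beta * c;  // Hamilton products: 16 real FMAs each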

int run(Options options) {

  // PASS/FAIL status
  bool passed = true;

  // Create a tuple of problem size for matrix multiplication
  cutlass::gemm::GemmCoord problem_size = options.problem_size;

  // Initialize tensors using CUTLASS helper functions
  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(
      problem_size.mk());  // <- Create matrix A with dimensions M x K
  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(
      problem_size.kn());  // <- Create matrix B with dimensions K x N
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(
      problem_size.mn());  // <- Create matrix C with dimensions M x N
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_d(
      problem_size.mn());  // <- Create matrix D with dimensions M x N used to store output from
                           // CUTLASS kernel
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_d(
      problem_size.mn());  // <- Create matrix D with dimensions M x N used to store output from
                           // reference kernel

  // Fill input and output matrices on host using CUTLASS helper functions
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(),
      1,
      4,
      -4,
      0);  // <- Fill matrix A on host with uniform-distribution random data

  cutlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(),
      1,
      4,
      -4,
      0);  // <- Fill matrix B on host with uniform-distribution random data

  cutlass::reference::host::TensorFillRandomUniform(
      tensor_c.host_view(),
      1,
      4,
      -4,
      0);  // <- Fill matrix C on host with uniform-distribution random data

  cutlass::reference::host::TensorFill(
      tensor_d.host_view());  // <- fill matrix D on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_ref_d.host_view());  // <- fill matrix D for reference on host with zeros

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c.sync_device();
  tensor_d.sync_device();
  tensor_ref_d.sync_device();

  // Initialize alpha and beta for dot product computation
  ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
  ElementComputeEpilogue beta = ElementComputeEpilogue(0);

  // Split the K dimension into 1 partition
  int split_k_slices = 1;

  // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch
  // instantiated CUTLASS kernel
  typename Gemm::Arguments arguments{problem_size,           // <- problem size of matrix multiplication
                                     tensor_a.device_ref(),  // <- reference to matrix A on device
                                     tensor_b.device_ref(),  // <- reference to matrix B on device
                                     tensor_c.device_ref(),  // <- reference to matrix C on device
                                     tensor_d.device_ref(),  // <- reference to matrix D on device
                                     {alpha, beta},          // <- tuple of alpha and beta
                                     split_k_slices};        // <- k-dimension split factor

  // Using the arguments, query for extra workspace required for matrix multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Allocate workspace memory
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
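
  // With split_k_slices == 1 the workspace is typically zero bytes; split-K
  // configurations use it to stage partial accumulator tiles for reduction.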

  // Instantiate CUTLASS kernel depending on templates
  Gemm gemm_op;

  // Check whether the problem size is supported or not
  cutlass::Status status = gemm_op.can_implement(arguments);
  CUTLASS_CHECK(status);

  // Initialize CUTLASS kernel with arguments and workspace pointer
  status = gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(status);

  // Result structure
  Result result;

  //
  // Construct events
  //

  cudaEvent_t events[2];

  for (auto & event : events) {
    result.error = cudaEventCreate(&event);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
      return -1;
    }
  }

  // Record an event at the start of a series of GEMMs
  result.error = cudaEventRecord(events[0]);
  if (result.error != cudaSuccess) {
    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
    return -1;
  }

  //
  // Run profiling loop
  //

  for (int iter = 0; iter < options.iterations; ++iter) {

    // Launch initialized CUTLASS kernel
    status = gemm_op();
    CUTLASS_CHECK(status);
  }

  //
  // Stop profiling loop
  //

  // Record an event when the GEMMs are complete
  result.error = cudaEventRecord(events[1]);
  if (result.error != cudaSuccess) {
    std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Wait for work on the device to complete.
  result.error = cudaEventSynchronize(events[1]);
  if (result.error != cudaSuccess) {
    std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Measure elapsed runtime
  float runtime_ms = 0;
  result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
  if (result.error != cudaSuccess) {
    std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(result.error) << std::endl;
    return -1;
  }

  // Compute average runtime and GFLOPs.
  result.runtime_ms = double(runtime_ms) / double(options.iterations);
  result.gflops = options.gflops(result.runtime_ms / 1000.0);

  // Cleanup
  for (auto event : events) {
    (void)cudaEventDestroy(event);
  }

  if (options.reference_check) {

    // Create instantiation for device reference gemm kernel
    cutlass::reference::device::Gemm<ElementInputA,
                                     LayoutInputA,
                                     ElementInputB,
                                     LayoutInputB,
                                     ElementOutput,
                                     LayoutOutput,
                                     ElementComputeEpilogue,
                                     ElementComputeEpilogue> gemm_device;

    // Launch device reference gemm kernel
    gemm_device(problem_size,
                alpha,
                tensor_a.device_ref(),
                tensor_b.device_ref(),
                beta,
                tensor_c.device_ref(),
                tensor_ref_d.device_ref());

    // Wait for kernels to finish
    cudaDeviceSynchronize();

    // Copy output data from CUTLASS and reference kernel to host for comparison
    tensor_d.sync_host();
    tensor_ref_d.sync_host();

    // Check whether outputs from the CUTLASS kernel and the reference kernel are equal
    passed &= cutlass::reference::host::TensorEquals(
        tensor_d.host_view(),
        tensor_ref_d.host_view());
  }

  if (passed) {
    std::cout << "Runtime: " << result.runtime_ms << " ms" << std::endl;
    std::cout << " GFLOPs: " << result.gflops << std::endl;
  }

  std::cout << (passed ? "Passed" : "Failed") << std::endl;
  return (passed ? 0 : -1);
}

int main(int argc, char const** argv) {

  Options options;
  options.parse(argc, argv);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  printf("%d x %d x %d Single Precision Quaternion Matrix Multiply\n", \
    options.problem_size.m(), options.problem_size.n(), options.problem_size.k());

  if (!options.valid()) {
    std::cerr << "Invalid problem." << std::endl;
    return -1;
  }

  return run(options);
}

examples/22_quaternion_conv/CMakeLists.txt (new file, 28 lines)
@ -0,0 +1,28 @@
# Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted
# provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright notice, this list of
#       conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
#       to endorse or promote products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cutlass_example_add_executable(
  22_quaternion_conv
  quaternion_conv.cu
)

examples/22_quaternion_conv/quaternion_conv.cu (new file, 660 lines)
@ -0,0 +1,660 @@
/***************************************************************************************************
 * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright notice, this list of
 *       conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright notice, this list of
 *       conditions and the following disclaimer in the documentation and/or other materials
 *       provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
 *       to endorse or promote products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#include <iostream>
#include <sstream>
#include <fstream>  // needed for std::ofstream used by --save-workspace below

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/conv/device/implicit_gemm_convolution.h"

#include "cutlass/util/command_line.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/reference/device/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/convolution.h"

#include "helper.h"

// The code section below describes datatype for input, output tensors and computation between
// elements
using Element = cutlass::Quaternion<float>;
using ElementAccumulator = Element;       // Data type of accumulator
using ElementComputeEpilogue = Element;   // Data type of epilogue computation (alpha, beta)
using ElementInputA = Element;            // Data type of elements in input tensor
using ElementInputB = Element;            // Data type of elements in input tensor
using ElementOutput = Element;            // Data type of elements in output tensor

using LayoutInputA = cutlass::layout::TensorNHWC;
using LayoutInputB = cutlass::layout::TensorNHWC;
using LayoutOutput = cutlass::layout::TensorNHWC;

// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM
using MMAOp = cutlass::arch::OpClassSimt;

// This code section describes CUDA SM architecture number
using SmArch = cutlass::arch::Sm50;

// This code section describes the tile size a thread block will compute
using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 8>;  // Threadblock tile shape

// This code section describes tile size a warp will compute
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;         // Warp tile shape

// This code section describes the size of MMA op
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;    // SIMT instruction shape

// This code section describes how threadblocks are scheduled on GPU
using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;

// Number of pipelines you want to use
constexpr int NumStages = 2;

// This code section describes whether the selected iterator algorithm is Analytic or Optimized
static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = cutlass::conv::IteratorAlgorithm::kOptimized;
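
// kOptimized specializes the implicit-GEMM tile iterators with precomputed
// pointer-update deltas; kAnalytic recomputes tensor coordinates each step
// and is more general but slower.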

// This code section describes the epilogue part of the kernel, we use default value
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,  // Data type of output matrix.
    128 / cutlass::sizeof_bits<ElementOutput>::value,  // The number of elements per vectorized
                                                       // memory access. This becomes the vector
                                                       // width of math instructions in the
                                                       // epilogue too.
    ElementAccumulator,       // Data type of accumulator
    ElementComputeEpilogue>;  // Data type for alpha/beta in linear combination

using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop<
    ElementInputA, LayoutInputA,
    ElementInputB, LayoutInputB,
    ElementOutput, LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch,
    ThreadblockShape,
    WarpShape,
    InstructionShape,
    EpilogueOp,
    SwizzleThreadBlock,
    NumStages,
    cutlass::arch::OpMultiplyAdd,
    IteratorAlgorithm
>::Kernel;

using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<Conv2dFpropKernel>;
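
// Implicit GEMM lowers Conv2d forward propagation onto the GEMM machinery
// above without materializing an im2col matrix: the equivalent GEMM extent
// is (N*P*Q) x K x (C*R*S), with activation and filter tiles gathered on
// the fly by the iterators.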

/////////////////////////////////////////////////////////////////////////////////////////////////

// Command line options parsing
struct Options {

  bool help;
  cutlass::Tensor4DCoord input_size;
  cutlass::Tensor4DCoord filter_size;
  cutlass::Tensor4DCoord padding;
  cutlass::MatrixCoord conv_stride;
  cutlass::MatrixCoord dilation;
  bool reference_check;
  bool measure_performance;
  int iterations;
  bool save_workspace;
  ElementComputeEpilogue alpha;
  ElementComputeEpilogue beta;
  bool benchmark;
  std::string tag;

  Options():
    help(false),
    input_size(1, 32, 32, 32),
    filter_size(32, 3, 3, 32),
    padding(1, 1, 1, 1),
    conv_stride(1, 1),
    dilation(1, 1),
    reference_check(false),
    measure_performance(true),
    iterations(20),
    save_workspace(false),
    alpha(1),
    beta(0),
    benchmark(false) { }

  // Verify the problem size is compatible with the CUTLASS Convolution implementation.
  bool valid() {

    //
    // CUTLASS attempts to load 128b vectors of elements. Consequently,
    // all pointers, strides, and tensor extents must be divisible by
    // kAlignment (8) elements.
    //
    int const kAlignment = 8;

    if ((input_size.c() % kAlignment) ||
        (filter_size.n() % kAlignment)) {

      // misaligned tensors
      return false;
    }

    // Invalid padding
    if ((padding.h() != filter_size.h() / 2) ||
        (padding.w() != filter_size.w() / 2)) {

      return false;
    }

    return true;
  }

  /// Updates input and filter sizes
  void update(
    cutlass::Tensor4DCoord input_size,
    cutlass::Tensor4DCoord filter_size) {

    this->input_size = input_size;
    this->filter_size = filter_size;

    padding.n() = filter_size.h() / 2;
    padding.h() = filter_size.h() / 2;
    padding.w() = filter_size.w() / 2;
    padding.c() = filter_size.w() / 2;
  }
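
  // padding = filter_extent / 2 is "same"-style padding: with stride 1 and
  // odd filter sizes, the spatial output extents equal the input extents.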

  // Parses the command line
  void parse(int argc, char const **args) {
    cutlass::CommandLine cmd(argc, args);

    if (cmd.check_cmd_line_flag("help")) {
      help = true;
    }

    if (cmd.check_cmd_line_flag("ref-check")) {
      reference_check = true;
    }

    if (cmd.check_cmd_line_flag("perf-check")) {
      measure_performance = true;
    }

    if (cmd.check_cmd_line_flag("save-workspace")) {
      save_workspace = true;
    }

    if (cmd.check_cmd_line_flag("benchmark")) {
      benchmark = true;
    }

    cmd.get_cmd_line_argument("n", input_size.n());
    cmd.get_cmd_line_argument("h", input_size.h());
    cmd.get_cmd_line_argument("w", input_size.w());
    cmd.get_cmd_line_argument("c", input_size.c());

    cmd.get_cmd_line_argument("k", filter_size.n());
    cmd.get_cmd_line_argument("r", filter_size.h());
    cmd.get_cmd_line_argument("s", filter_size.w());
    filter_size.c() = input_size.c();

    cmd.get_cmd_line_argument("alpha_w", alpha.w());
    cmd.get_cmd_line_argument("alpha_x", alpha.x());
    cmd.get_cmd_line_argument("alpha_y", alpha.y());
    cmd.get_cmd_line_argument("alpha_z", alpha.z());

    cmd.get_cmd_line_argument("beta_w", beta.w());
    cmd.get_cmd_line_argument("beta_x", beta.x());
    cmd.get_cmd_line_argument("beta_y", beta.y());
    cmd.get_cmd_line_argument("beta_z", beta.z());

    cmd.get_cmd_line_argument("iterations", iterations);
    cmd.get_cmd_line_argument("tag", tag);

    if (filter_size.h() == 3 && filter_size.w() == 3) {
      padding = {1, 1, 1, 1};
    }
    else {
      filter_size.h() = 1;
      filter_size.w() = 1;
      padding = {0, 0, 0, 0};
    }
  }

  /// Prints the usage statement.
  std::ostream & print_usage(std::ostream &out) const {

    out << "22_quaternion_conv example\n\n"
      << "  This example uses CUTLASS SIMT operators on quaternion<float> data to compute\n"
      << "  forward convolution on tensors of layout NHWC.\n\n"
      << "Options:\n\n"
      << "  --help               If specified, displays this usage statement.\n\n"
      << "  --n <int>            Input tensor extent N\n"
      << "  --h <int>            Input tensor extent H\n"
      << "  --w <int>            Input tensor extent W\n"
      << "  --c <int>            Input tensor extent C\n"
      << "  --k <int>            Filter extent K\n"
      << "  --r <int>            Filter extent R\n"
      << "  --s <int>            Filter extent S\n\n"
      << "  --alpha_w,_x,_y,_z <float>   Epilogue quaternion alpha components\n"
      << "  --beta_w,_x,_y,_z <float>    Epilogue quaternion beta components\n\n"
      << "  --ref-check          If set (true), reference check on the host is computed\n"
      << "  --perf-check         If set (true), performance is measured.\n"
      << "  --benchmark          If set (true), performance benchmarking on several layers and batch-size.\n"
      << "  --iterations <int>   Number of profiling iterations to perform.\n"
      << "  --save-workspace     If set, workspace is written to a text file.\n"
      << "  --tag <string>       String to replicate across the first column in the results table\n";

    out << "\n\nExamples:\n\n"
      << "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=32 --h=224 --w=224 --c=128 --k=256 --r=1 --s=1\n\n"
      << "$ ./examples/22_quaternion_conv/22_quaternion_conv --n=1 --h=224 --w=224 --c=32 --k=32 --r=3 --s=3 --ref-check\n\n";

    return out;
  }

  /// Computes the output tensor size (NPQK)
  cutlass::Tensor4DCoord output_size() const {
    return cutlass::Tensor4DCoord(
      input_size.n(),
      (input_size.h() + padding.n() + padding.h() - filter_size.h()) / conv_stride.row() + 1,
      (input_size.w() + padding.w() + padding.c() - filter_size.w()) / conv_stride.column() + 1,
      filter_size.n());
  }
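
  // Worked example with the defaults above: P = (32 + 1 + 1 - 3) / 1 + 1 = 32,
  // so a 1x32x32x32 input with a 32x3x3x32 filter yields a 1x32x32x32 output.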

  /// Compute performance in GFLOP/s
  double gflops(double runtime_s) const {

    // Number of real-valued multiply-adds = NPQK * CRS * 16, since each
    // quaternion multiply-add expands to 16 real fused multiply-adds
    int64_t fmas = output_size().product() * int64_t(filter_size.h() * filter_size.w() * filter_size.c()) * 16;

    // Two flops per multiply-add
    return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

struct Result {
  double runtime_ms;
  double gflops;
  cutlass::Status status;
  cutlass::Status reference_check;
  cudaError_t error;

  Result():
    runtime_ms(0),
    gflops(0),
    status(cutlass::Status::kSuccess),
    reference_check(cutlass::Status::kInvalid),
    error(cudaSuccess) { }

  static std::ostream & print_header(std::ostream &out, Options const &options) {

    if (!options.tag.empty()) {
      out << "Name,";
    }

    out << "Layer,N,H,W,C,K,R,S,Runtime,GFLOPs";

    return out;
  }

  std::ostream & print(std::ostream &out, int idx, Options const &options) {

    if (!options.tag.empty()) {
      out << options.tag << ",";
    }

    out
      << "conv_" << idx << ","
      << options.input_size.n() << ","
      << options.input_size.h() << ","
      << options.input_size.w() << ","
      << options.input_size.c() << ","
      << options.filter_size.n() << ","
      << options.filter_size.h() << ","
      << options.filter_size.w() << ","
      << runtime_ms << ","
      << gflops;

    return out;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Runs one benchmark
Result profile_convolution(Options const &options) {

  Result result;

  //
  // Allocate host-device tensors using the CUTLASS Utilities.
  //

  cutlass::HostTensor<ElementInputA, LayoutInputA> tensor_a(options.input_size);
  cutlass::HostTensor<ElementInputB, LayoutInputB> tensor_b(options.filter_size);
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_c(options.output_size());
  cutlass::HostTensor<ElementOutput, LayoutOutput> tensor_ref_c(options.output_size());

  //
  // Initialize tensors
  //

  // Fill tensor A on host with uniform-distribution random data
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_a.host_view(),
      1,
      7,
      -8,
      0);

  // Fill tensor B on host with uniform-distribution random data
  cutlass::reference::host::TensorFillRandomUniform(
      tensor_b.host_view(),
      1,
      7,
      -8,
      0);

  // Fill tensor C on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_c.host_view());

  // Fill tensor C for reference on host with zeros
  cutlass::reference::host::TensorFill(
      tensor_ref_c.host_view());

  // Copy data from host to GPU
  tensor_a.sync_device();
  tensor_b.sync_device();
  tensor_c.sync_device();
  tensor_ref_c.sync_device();

  //
  // Define arguments for CUTLASS Convolution
  //

  cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation;

  // Split the K dimension into 1 partition
  int split_k_slices = 1;

  // Construct Conv2dProblemSize with user defined output size
  cutlass::conv::Conv2dProblemSize problem_size(
      options.input_size,
      options.filter_size,
      options.padding,
      options.conv_stride,
      options.dilation,
      options.output_size(),
      mode,
      split_k_slices
  );

  // Construct ImplicitGemm::Argument structure with conv2d
  // problem size, data pointers, and epilogue values. Note that tensor_c
  // serves as both the source (C) and destination (D) operand here.
  typename ImplicitGemm::Arguments arguments{
    problem_size,
    tensor_a.device_ref(),
    tensor_b.device_ref(),
    tensor_c.device_ref(),
    tensor_c.device_ref(),
    {options.alpha, options.beta},
  };

  //
  // Initialize CUTLASS Convolution
  //

  ImplicitGemm implicit_gemm_op;

  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);

  // Allocate workspace memory
  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

  result.status = implicit_gemm_op.can_implement(arguments);
  CUTLASS_CHECK(result.status);

  result.status = implicit_gemm_op.initialize(arguments, workspace.get());
  CUTLASS_CHECK(result.status);

  //
  // Launch initialized CUTLASS kernel
  //
  result.status = implicit_gemm_op();

  CUTLASS_CHECK(result.status);

  //
  // Optional reference check
  //

  if (options.reference_check) {
    std::cout << "Verification on host...\n";

    // Compute with reference implementation
    cutlass::reference::host::Conv2dFprop<
      ElementInputA,
      LayoutInputA,
      ElementInputB,
      LayoutInputB,
      ElementOutput,
      LayoutOutput,
      ElementComputeEpilogue,
      ElementAccumulator,
      cutlass::NumericConverter<ElementOutput, ElementComputeEpilogue>
    >(
      problem_size,
      tensor_a.host_ref(),
      tensor_b.host_ref(),
      tensor_c.host_ref(),
      tensor_ref_c.host_ref(),
      options.alpha,
      options.beta
    );

    // Check whether outputs from the CUTLASS kernel and the reference kernel are equal
    tensor_c.sync_host();

    bool passed = cutlass::reference::host::TensorEquals(
      tensor_c.host_view(),
      tensor_ref_c.host_view());

    if (!passed) {
      result.reference_check = cutlass::Status::kErrorInternal;
      std::cout << "ERROR - results miscompared.\n";
    }
    else {
      result.reference_check = cutlass::Status::kSuccess;
      std::cout << "Passed.\n";
    }
  }
  else {
    result.reference_check = cutlass::Status::kInvalid;
  }

  if (options.save_workspace) {

    std::stringstream ss;

    ss << "22_quaternion_conv_"
      << options.input_size.n() << "x" << options.input_size.h() << "x" << options.input_size.w() << "x" << options.input_size.c()
      << "_"
      << options.filter_size.n() << "x" << options.filter_size.h() << "x" << options.filter_size.w() << "x" << options.filter_size.c()
      << ".dat";

    std::ofstream output_workspace(ss.str());

    output_workspace
      << "Input = \n" << tensor_a.host_view() << "\n\n"
      << "Filters = \n" << tensor_b.host_view() << "\n\n";

    if (options.reference_check) {
      output_workspace << "Reference = \n" << tensor_ref_c.host_view() << "\n\n";
    }

    output_workspace << "Computed = \n" << tensor_c.host_view() << std::endl;

    std::cout << "Results written to '" << ss.str() << "'." << std::endl;
  }

  //
  // Performance measurement
  //

  if (options.measure_performance) {

    cudaEvent_t events[2];

    for (auto & event : events) {
      result.error = cudaEventCreate(&event);
      if (result.error != cudaSuccess) {
        std::cerr << "cudaEventCreate() failed: " << cudaGetErrorString(result.error) << std::endl;
        return result;
      }
    }

    // Record an event at the start of a series of convolution operations.
    result.error = cudaEventRecord(events[0]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Launch a sequence of implicit GEMM operations on the device
    for (int iteration = 0; iteration < options.iterations; ++iteration) {
      result.status = implicit_gemm_op();
      CUTLASS_CHECK(result.status);
    }

    // Record an event when the convolutions have been launched.
    result.error = cudaEventRecord(events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventRecord() failed: " << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Wait for work on the device to complete.
    result.error = cudaEventSynchronize(events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventSynchronize() failed: " << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Measure elapsed runtime
    float runtime_ms = 0;
    result.error = cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
    if (result.error != cudaSuccess) {
      std::cerr << "cudaEventElapsedTime() failed: " << cudaGetErrorString(result.error) << std::endl;
      return result;
    }

    // Print average runtime and GFLOPs.
    result.runtime_ms = double(runtime_ms) / double(options.iterations);
    result.gflops = options.gflops(result.runtime_ms / 1000.0);

    // Cleanup
    for (auto event : events) {
      (void)cudaEventDestroy(event);
    }
  }

  return result;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

int main(int argc, char const **args) {

  Options options;

  options.parse(argc, args);

  if (options.help) {
    options.print_usage(std::cout) << std::endl;
    return 0;
  }

  if (options.benchmark) {
    // Benchmark several layers

    int batch_sizes[] = {1, 32, 64, 128, 256, 512};

    struct Benchmark {
      int h, w, c, k, r, s;
    } layers[] = {
      {56, 56,   64,  256, 1, 1},
      {56, 56,   64,   64, 1, 1},
      {56, 56,   64,   64, 3, 3},
      {56, 56,  256,   64, 1, 1},
      {56, 56,  256,  512, 1, 1},
      {56, 56,  256,  128, 1, 1},
      {28, 28,  128,  128, 3, 3},
      {28, 28,  128,  512, 1, 1},
      {28, 28,  512,  128, 1, 1},
      {28, 28,  512, 1024, 1, 1},
      {28, 28,  512,  256, 1, 1},
      {14, 14,  256,  256, 3, 3},
      {14, 14,  256, 1024, 1, 1},
      {14, 14, 1024,  256, 1, 1},
      {14, 14, 1024, 2048, 1, 1},
      {14, 14, 1024,  512, 1, 1},
      { 7,  7,  512,  512, 3, 3},
    };

    Result::print_header(std::cout, options) << std::endl;

    int idx = 1;

    for (auto const &layer : layers) {
      for (auto N : batch_sizes) {

        options.update({N, layer.h, layer.w, layer.c}, {layer.k, layer.r, layer.s, layer.c});

        Result result = profile_convolution(options);
        result.print(std::cout, idx, options) << std::endl;
      }

      ++idx;
    }
  }
  else {

    // Execute one problem size
    if (!options.valid()) {
      std::cerr << "Invalid problem." << std::endl;
      return -1;
    }

    Result result = profile_convolution(options);

    Result::print_header(std::cout, options) << std::endl;
    result.print(std::cout, 1, options) << std::endl;
  }

  return 0;
}

/////////////////////////////////////////////////////////////////////////////////////////////////

@ -28,10 +28,14 @@ add_custom_target(test_examples)
function(cutlass_example_add_executable NAME)

  set(options)
  set(oneValueArgs)
  set(oneValueArgs DISABLE_TESTS)
  set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
  cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

  if (NOT DEFINED __DISABLE_TESTS)
    set(__DISABLE_TESTS OFF)
  endif()

  cutlass_add_executable(${NAME} ${__UNPARSED_ARGUMENTS})

  add_dependencies(cutlass_examples ${NAME})
@ -60,6 +64,7 @@ function(cutlass_example_add_executable NAME)
      DEPENDEES test_examples ${__DEPENDEES}
      TEST_COMMAND_OPTIONS ${__TEST_COMMAND_OPTIONS}
      DISABLE_EXECUTABLE_INSTALL_RULE
      DISABLE_TESTS ${__DISABLE_TESTS}
    )

endfunction()
@ -83,6 +88,11 @@ foreach(EXAMPLE
  15_ampere_sparse_tensorop_gemm
  16_ampere_tensorop_conv2dfprop
  17_fprop_per_channel_bias
  18_ampere_fp64_tensorop_affine2_gemm
  19_tensorop_canonical
  20_simt_canonical
  21_quaternion_gemm
  22_quaternion_conv
)

  add_subdirectory(${EXAMPLE})