CUTLASS 3.1 (#915)

Co-authored-by: Aniket Shivam <ashivam@nvidia.com>
ANIKET SHIVAM committed 2023-04-14 20:19:34 -07:00 (committed by GitHub)
parent 9b8166e3f0
commit d572cc1aab
482 changed files with 37184 additions and 16419 deletions

View File

@ -25,7 +25,7 @@
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_policy(SET CMP0112 NEW)
include(GNUInstallDirs)
find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
@ -94,6 +94,9 @@ file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOU
# set cutlass generator compiler version to filter kernels in the generator not supported by a specific toolkit.
set(CUTLASS_GENERATOR_CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION})
# --log-level is set to DEBUG to enable printing information about which kernels were excluded
# from generation in /tools/library/scripts/manifest.py. To avoid having this information appear
# in ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log, set this parameter to INFO
execute_process(
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts
COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/scripts/generator.py
@ -112,6 +115,8 @@ execute_process(
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log
)
message(STATUS "Completed generation of library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log for more information.")
if(NOT cutlass_lib_INSTANCE_GENERATION_RESULT EQUAL 0)
message(FATAL_ERROR "Error generating library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log")
endif()

View File

@ -102,6 +102,12 @@ template <typename OperatorClass> struct ArchMap<arch::Sm90, OperatorClass> {
static int const kMax = 1024;
};
// Arch conditional WGMMA
template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
static int const kMin = 90;
static int const kMax = 90;
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library

View File

@ -178,7 +178,7 @@ public:
int K, /// GEMM K dimension
NumericTypeID element_compute, /// Data type of internal accumulation
NumericTypeID element_scalar, /// Data type of alpha/beta scalars
void const *alpha, /// Pointer to alpha scalar
@ -186,29 +186,29 @@ public:
NumericTypeID element_A, /// Data type of A matrix elements
LayoutTypeID layout_A, /// Layout of A matrix
ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices
void const * ptr_A, /// Pointer to A matrix in Global Memory
int64_t lda, /// Leading dimension of A matrix
int64_t lda, /// Leading dimension of A matrix
NumericTypeID element_B, /// Data type of B matrix elements
LayoutTypeID layout_B, /// Layout of B matrix
ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices
void const * ptr_B, /// Pointer to B matrix in Global Memory
int64_t ldb, /// Leading dimension of B matrix
int64_t ldb, /// Leading dimension of B matrix
void const * beta, /// Pointer to beta scalar
NumericTypeID element_C, /// Data type of C and D matrices
NumericTypeID element_C, /// Data type of C matrix
LayoutTypeID layout_C, /// Layout of C matrix
void const * ptr_C, /// Pointer to C matrix
int64_t ldc, /// Leading dimension of C matrix
int64_t ldc, /// Leading dimension of C matrix
NumericTypeID element_D, /// Data type of D matrix
LayoutTypeID layout_D, /// Layout of D matrix
void * ptr_D, /// Pointer to D matrix
int64_t ldd, /// Leading dimension of D matrix
int64_t ldd, /// Leading dimension of D matrix
int batch_count = 1, /// Batch count or number of split-K slices
int64_t batch_stride_A = 0, /// Batch stride of A operand
int64_t batch_stride_B = 0, /// Batch stride of B operand
int64_t batch_stride_C = 0, /// Batch stride of C operand

View File

@ -114,6 +114,8 @@ enum class NumericTypeID {
kS16,
kS32,
kS64,
kFE4M3,
kFE5M2,
kF16,
kBF16,
kTF32,
@ -474,9 +476,12 @@ struct GemmDescription : public OperationDescription {
/// Describes the B operand
TensorDescription B;
/// Describes the source and destination matrices
/// Describes the source matrix
TensorDescription C;
/// Describes the destination matrix
TensorDescription D;
/// Describes the sparse meta matrices
TensorDescription E;
@ -501,6 +506,7 @@ struct GemmDescription : public OperationDescription {
TensorDescription const &A = TensorDescription(),
TensorDescription const &B = TensorDescription(),
TensorDescription const &C = TensorDescription(),
TensorDescription const &D = TensorDescription(),
NumericTypeID element_epilogue = NumericTypeID::kInvalid,
SplitKMode split_k_mode = SplitKMode::kNone,
ComplexTransform transform_A = ComplexTransform::kNone,
@ -510,6 +516,7 @@ struct GemmDescription : public OperationDescription {
A(A),
B(B),
C(C),
D(D),
element_epilogue(element_epilogue),
split_k_mode(split_k_mode),
transform_A(transform_A),
@ -527,13 +534,14 @@ struct SparseGemmDescription : public GemmDescription {
TensorDescription const &A = TensorDescription(),
TensorDescription const &B = TensorDescription(),
TensorDescription const &C = TensorDescription(),
TensorDescription const &D = TensorDescription(),
TensorDescription const &E = TensorDescription(),
NumericTypeID element_epilogue = NumericTypeID::kInvalid,
SplitKMode split_k_mode = SplitKMode::kNone,
ComplexTransform transform_A = ComplexTransform::kNone,
ComplexTransform transform_B = ComplexTransform::kNone
):
GemmDescription(gemm_kind, A, B, C, element_epilogue, split_k_mode, transform_A, transform_B)
GemmDescription(gemm_kind, A, B, C, D, element_epilogue, split_k_mode, transform_A, transform_B)
{this->E = E;}
};
@ -1019,6 +1027,9 @@ struct GemmUniversalArguments {
int64_t batch_stride_B;
int64_t batch_stride_C;
int64_t batch_stride_D;
// Needed for some 3.x kernels
int sm_count;
};
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -66,6 +66,9 @@ struct GemmFunctionalKey {
LayoutTypeID layout_B;
ComplexTransform transform_B;
NumericTypeID element_C;
LayoutTypeID layout_C;
NumericTypeID element_D;
LayoutTypeID layout_D;
//
// Methods
@ -83,7 +86,10 @@ struct GemmFunctionalKey {
NumericTypeID element_B = NumericTypeID::kF16,
LayoutTypeID layout_B = LayoutTypeID::kColumnMajor,
ComplexTransform transform_B = ComplexTransform::kNone,
NumericTypeID element_C = NumericTypeID::kF16
NumericTypeID element_C = NumericTypeID::kF16,
LayoutTypeID layout_C = LayoutTypeID::kColumnMajor,
NumericTypeID element_D = NumericTypeID::kF16,
LayoutTypeID layout_D = LayoutTypeID::kColumnMajor
):
provider(provider),
gemm_kind(gemm_kind),
@ -95,7 +101,10 @@ struct GemmFunctionalKey {
element_B(element_B),
layout_B(layout_B),
transform_B(transform_B),
element_C(element_C)
element_C(element_C),
layout_C(layout_C),
element_D(element_D),
layout_D(layout_D)
{ }
inline
@ -111,7 +120,10 @@ struct GemmFunctionalKey {
(element_B == rhs.element_B) &&
(layout_B == rhs.layout_B) &&
(transform_B == rhs.transform_B) &&
(element_C == rhs.element_C);
(element_C == rhs.element_C) &&
(layout_C == rhs.layout_C) &&
(element_D == rhs.element_D) &&
(layout_D == rhs.layout_D);
}
inline
@ -137,6 +149,9 @@ std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey
<< " layout_B: " << to_string(k.layout_B) << "\n"
<< " transform_B: " << to_string(k.transform_B) << "\n"
<< " element_C: " << to_string(k.element_C) << "\n"
<< " layout_C: " << to_string(k.layout_C) << "\n"
<< " element_D: " << to_string(k.element_D) << "\n"
<< " layout_D: " << to_string(k.layout_D) << "\n"
<< "}";
return out;
@ -157,18 +172,21 @@ struct GemmFunctionalKeyHasher {
size_t operator()(GemmFunctionalKey const &key) const {
IntHash hash;
return
rotl(hash(int(key.provider)), 1) ^
rotl(hash(int(key.gemm_kind)), 2) ^
return
rotl(hash(int(key.provider)), 1) ^
rotl(hash(int(key.gemm_kind)), 2) ^
rotl(hash(int(key.element_compute)), 3) ^
rotl(hash(int(key.element_scalar)), 4) ^
rotl(hash(int(key.element_A)), 5) ^
rotl(hash(int(key.layout_A)), 6) ^
rotl(hash(int(key.transform_A)), 7) ^
rotl(hash(int(key.element_B)), 8) ^
rotl(hash(int(key.layout_B)), 9) ^
rotl(hash(int(key.transform_B)), 10) ^
rotl(hash(int(key.element_C)), 11);
rotl(hash(int(key.element_scalar)), 4) ^
rotl(hash(int(key.element_A)), 5) ^
rotl(hash(int(key.layout_A)), 6) ^
rotl(hash(int(key.transform_A)), 7) ^
rotl(hash(int(key.element_B)), 8) ^
rotl(hash(int(key.layout_B)), 9) ^
rotl(hash(int(key.transform_B)), 10) ^
rotl(hash(int(key.element_C)), 11) ^
rotl(hash(int(key.layout_C)), 12) ^
rotl(hash(int(key.element_D)), 13) ^
rotl(hash(int(key.layout_D)), 14);
}
};

View File

@ -23,7 +23,8 @@ from library import *
class GemmOperation:
#
def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None,
kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto):
self.prefix = "3x" if gemm_kind == GemmKind.Universal3x else ""
self.operation_kind = OperationKind.Gemm
@ -33,6 +34,15 @@ class GemmOperation:
self.A = A
self.B = B
self.C = C
self.D = D
if self.D == None:
self.D = self.C
if gemm_kind != GemmKind.Universal3x:
assert(kernel_schedule == KernelScheduleType.ScheduleAuto)
assert(epilogue_schedule == EpilogueScheduleType.ScheduleAuto)
self.kernel_schedule = kernel_schedule
self.epilogue_schedule = epilogue_schedule
self.element_epilogue = element_epilogue
self.epilogue_functor = epilogue_functor
self.swizzling_functor = swizzling_functor
@ -122,11 +132,12 @@ class GemmOperation:
def extended_name_3x(self):
'''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}".format(
extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
element_a = DataTypeNames[self.A.element],
element_b = DataTypeNames[self.B.element],
element_acc = DataTypeNames[self.tile_description.math_instruction.element_accumulator],
element_c = DataTypeNames[self.C.element],
element_d = DataTypeNames[self.D.element],
core_name = self.core_name())
return extended_name
@ -152,12 +163,20 @@ class GemmOperation:
ShortLayoutTypeNames[self.B.layout],
ShortLayoutTypeNames[self.C.layout])
# Generates a short string representing underlying kernel schedule type
def kernel_schedule_name_3x(self):
return KernelScheduleSuffixes[self.kernel_schedule]
# Generates a short string representing underlying epilogue schedule type
def epilogue_schedule_name_3x(self):
return EpilogueScheduleSuffixes[self.epilogue_schedule]
# Generates the full kernel function name
def procedural_name(self):
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
if self.arch >= 90:
kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}"
kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}"
return kernel_name_template.format(
p = self.prefix,
ar = self.arch,
@ -171,7 +190,9 @@ class GemmOperation:
ck = self.tile_description.cluster_shape[2],
l = self.tile_description.stages,
s = self.layout_name_3x(),
al = str(max(self.A.alignment, self.B.alignment)))
al = str(max(self.A.alignment, self.B.alignment)),
k = self.kernel_schedule_name_3x(),
e = self.epilogue_schedule_name_3x())
else:
threadblock = self.tile_description.procedural_name()
return "cutlass{p}_{op}_{ex}_{tb}_{l}_align{a}".format(
@ -604,8 +625,7 @@ class EmitGemmUniversal3xInstance:
"cutlass/numeric_types.h",
"cutlass/gemm/kernel/gemm_universal.hpp",
"cutlass/gemm/collective/collective_builder.hpp",
"cutlass/epilogue/collective/default_epilogue.hpp",
"cutlass/epilogue/thread/linear_combination.h",
"cutlass/epilogue/collective/collective_builder.hpp",
]
self.builtin_epilogue_functor_template = """
${epilogue_functor}<
@ -617,6 +637,18 @@ class EmitGemmUniversal3xInstance:
"""
self.gemm_template = """
using ${operation_name}_epilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
${arch}, ${opcode_class},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::epilogue::collective::EpilogueTileAuto,
${element_accumulator}, ${element_epilogue},
${element_c}, ${layout_c}, ${align_c},
${element_d}, ${layout_d}, ${align_d},
${epilogue_schedule}
>::CollectiveOp;
using ${operation_name}_mainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
${arch}, ${opcode_class},
@ -625,18 +657,11 @@ using ${operation_name}_mainloop =
${element_accumulator},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::gemm::collective::StageCountAuto,
cutlass::gemm::collective::KernelScheduleAuto
cutlass::gemm::collective::StageCountAutoCarveout<
sizeof(typename ${operation_name}_epilogue::SharedStorage)>,
${kernel_schedule}
>::CollectiveOp;
using ${operation_name}_epilogue =
cutlass::epilogue::collective::DefaultEpilogue<
cutlass::gemm::TagToStrideC_t<${layout_c}>,
cutlass::gemm::TagToStrideC_t<${layout_c}>,
cutlass::epilogue::thread::LinearCombination<
${element_c}, ${epilogue_vector_length}, ${element_accumulator}, ${element_epilogue}>
>;
// Gemm operator ${operation_name}
using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int,int,int,int>,
@ -670,8 +695,8 @@ ${compile_guard_end}
stage_count_string = "cutlass::gemm::collective::StageCountAuto"
warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
instance_layout_A, instance_layout_B, instance_layout_C = \
(operation.A.layout, operation.B.layout, operation.C.layout)
instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
(operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
# 3.0 profiler integration only supports trivial epilogues for now
epilogue_vector_length = 1
@ -697,6 +722,8 @@ ${compile_guard_end}
'layout_b': LayoutTag[instance_layout_B],
'element_c': DataTypeTag[operation.C.element],
'layout_c': LayoutTag[instance_layout_C],
'element_d': DataTypeTag[operation.D.element],
'layout_d': LayoutTag[instance_layout_D],
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
@ -712,10 +739,14 @@ ${compile_guard_end}
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
'kernel_schedule' : str(KernelScheduleTag[operation.kernel_schedule]),
'epilogue_schedule' : str(EpilogueScheduleTag[operation.epilogue_schedule]),
'epilogue_functor': epilogue_functor,
'stages': stage_count_string,
'align_a': str(operation.A.alignment),
'align_b': str(operation.B.alignment),
'align_c': str(operation.C.alignment),
'align_d': str(operation.C.alignment),
'transform_a': ComplexTransformTag[operation.A.complex_transform],
'transform_b': ComplexTransformTag[operation.B.complex_transform],
'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],

File diff suppressed because it is too large

View File

@ -361,6 +361,58 @@ ShortComplexLayoutNames = {
(LayoutType.RowMajor, ComplexTransform.conj): 'h'
}
###################################################################################################
class KernelScheduleType(enum.Enum):
ScheduleAuto = enum_auto()
Multistage = enum_auto()
Tma = enum_auto()
TmaWarpSpecialized = enum_auto()
TmaWarpSpecializedPingpong = enum_auto()
TmaWarpSpecializedCooperative = enum_auto()
#
KernelScheduleTag = {
KernelScheduleType.ScheduleAuto: 'cutlass::gemm::collective::KernelScheduleAuto',
KernelScheduleType.Multistage: 'cutlass::gemm::KernelMultistage',
KernelScheduleType.Tma: 'cutlass::gemm::KernelTma',
KernelScheduleType.TmaWarpSpecialized: 'cutlass::gemm::KernelTmaWarpSpecialized',
KernelScheduleType.TmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpong',
KernelScheduleType.TmaWarpSpecializedCooperative: 'cutlass::gemm::KernelTmaWarpSpecializedCooperative',
}
#
KernelScheduleSuffixes = {
KernelScheduleType.ScheduleAuto: '',
KernelScheduleType.Multistage: '_cpasync',
KernelScheduleType.Tma: '_unspecialized',
KernelScheduleType.TmaWarpSpecialized: '_warpspecialized',
KernelScheduleType.TmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
KernelScheduleType.TmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
}
class EpilogueScheduleType(enum.Enum):
ScheduleAuto = enum_auto()
EpilogueTransposed = enum_auto()
NoSmemWarpSpecialized = enum_auto()
TmaWarpSpecialized = enum_auto()
TmaWarpSpecializedCooperative = enum_auto()
#
EpilogueScheduleTag = {
EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed',
EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized',
EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized',
EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
}
#
EpilogueScheduleSuffixes = {
EpilogueScheduleType.ScheduleAuto: '',
EpilogueScheduleType.EpilogueTransposed: '',
EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem',
EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma',
EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
}
###################################################################################################
#

View File

@ -1,143 +0,0 @@
# PyCUTLASS: CUTLASS Python Interface
PyCUTLASS is a Python interface to the CUTLASS C++ template library. PyCUTLASS takes user-defined operation descriptions, emits C++ code, and compiles it with `nvcc` or `nvrtc`. It also provides wrappers for user-provided arguments from [numpy](https://numpy.org/), [torch](https://pytorch.org/), and [cupy](https://github.com/cupy/cupy) and encodes them into kernel parameters.
```python
import pycutlass
from pycutlass import *
import torch
pycutlass.get_memory_pool(2**8, 2**32)
math_inst = MathInstruction(
[1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
cutlass.OpClass.Simt, MathOperation.multiply_add
)
tile_description = TileDescription(
[128, 128, 8], 4, [2, 4, 1],
math_inst
)
A = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
B = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
C = TensorDescription(
cutlass.float32, cutlass.RowMajor, 1
)
epilogue_functor = LinearCombination(cutlass.float32, 1, cutlass.float32, cutlass.float32)
operation = GemmOperationUniversal(
arch=80, tile_description=tile_description,
A=A, B=B, C=C,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
pycutlass.compiler.add_module([operation,])
problem_size = cutlass.gemm.GemmCoord(512, 256, 128)
tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_D = torch.empty_like(tensor_C)
alpha = 1.0
beta = 0.0
arguments = GemmArguments(
operation=operation, problem_size=problem_size,
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
output_op=operation.epilogue_type(alpha, beta),
gemm_mode=cutlass.gemm.Mode.Gemm, split_k_splices=1
)
operation.run(arguments)
arguments.sync()
tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C
assert torch.equal(tensor_D, tensor_D_ref)
```
PyCUTLASS also provides infrastructure for profiling, compiled artifact management, and pooled memory management.
## Supported Features
PyCUTLASS currently supports the following operations:
* GEMM with mode {Serial, Parallel Split K, Batched GEMM, Array GEMM}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor, Row/ColumnMajorInterleaved<32> for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, swizzling functions {IdentitySwizzle<1,2,4,8>, HorizontalSwizzle, BatchedIdentitySwizzle}, and epilogue {LinearCombination, LinearCombinationClamp}
* GEMM grouped with op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, scheduling mode {Host, Device}, and epilogue {LinearCombination, LinearCombinationClamp}.
* Conv2d with {Fprop, Dgrad, Wgrad}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {TensorNHWC, TensorNC32HW32 and TensorC32RSK32 for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, split-k mode {Parallel, Serial}, and epilogue {LinearCombination, LinearCombinationClamp}
The tiling sizes of the above operations can also be customized.
## Installation
### Using Docker
We recommend using one of the provided Docker images to run PyCUTLASS.
**To run CUTLASS 3 GEMM kernels targeting the NVIDIA Hopper architecture via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda12.0) based on the NGC CUDA 12.0 container:
```shell
docker build -t pycutlass-cuda12.0:latest -f docker/Dockerfile-cuda12.0 .
docker run --gpus all -it --rm pycutlass-cuda12.0:latest
```
Note that this Docker container does not include CuPy or PyTorch, and, thus, will not be able to run PyCUTLASS examples that
leverage these packages.
**To run CUTLASS 2.x kernels targeting pre-SM90 architectures via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda11.8-pytorch) based on an NGC PyTorch container:
```shell
docker build -t pycutlass-cuda11.8-pytorch:latest -f docker/Dockerfile-cuda11.8-pytorch .
docker run --gpus all -it --rm pycutlass-cuda11.8-pytorch:latest
```
### Environment variables
PyCUTLASS requires two environment variables:
* `CUTLASS_PATH`: the root directory of CUTLASS. You can set this from the location at which you cloned CUTLASS via: `export CUTLASS_PATH=$(pwd)`.
* `CUDA_INSTALL_PATH`: the directory where the CUDA Toolkit is installed. If running in bash with `nvcc` installed under a CUDA Toolkit, you can set this to the location of your `nvcc` installation via: `export CUDA_INSTALL_PATH=$(which nvcc | awk -F'/bin/nvcc' '{print $1}')`
After setting these two environment variables, PyCUTLASS can be installed with
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass && bash build.sh
```
## Examples
Examples can be found in [$CUTLASS_PATH/examples/40_cutlass_py](examples/40_cutlass_py)
## Test
The test cases are listed in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
```shell
# Each of these tests is only supported on devices with compute capability SM80. For other devices,
# see the basic examples in $CUTLASS_PATH/examples/40_cutlass_py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/example && bash run_all_example.sh
```
## Build Documentation
Run
```shell
bash build_doc.sh
```
## Troubleshooting
### Issue 1: permission denied
Building PyCUTLASS requires installing dependencies into your Python environment, so conda may be an option if you don't have the necessary permissions.
### Issue 2: rmm: module not found
PyCUTLASS manages device memory with [RMM](https://github.com/rapidsai/rmm). Our `build.sh` automatically pulls [rmm branch-22.08](https://github.com/rapidsai/rmm/tree/branch-22.08) from GitHub and builds it from source. RMM is placed at `$CUTLASS_PATH/tools/library/scripts/pycutlass/rmm`. It requires `cmake > 3.20.1`. If the build fails, it can be fixed manually with the following steps:
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm && ./build.sh librmm rmm
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm/python
python setup.py build_ext --inplace
python setup.py install
```
To test whether RMM is successfully installed, try `import rmm`. For other issues related to RMM, please check https://github.com/rapidsai/rmm/issues.

View File

@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
pip install -U pybind11
git clone https://github.com/google/googletest.git
python setup.py develop --user
python setup.py rmm

View File

@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
pip install enum-tools
pip install sphinx-toolbox
pip install m2r2
sphinx-build -b html docs/source/ docs/build/html

View File

@ -1,40 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
FROM nvcr.io/nvidia/pytorch:22.11-py3
RUN chmod ugo+rwx /home
RUN pip uninstall -y rmm
RUN pip install rmm-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda

View File

@ -1,46 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
FROM nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu20.04
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
RUN apt-get install -y git cmake vim python3 python3-pip
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN chmod ugo+rwx /home
RUN pip install numpy==1.23
RUN pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda

View File

@ -1,52 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

View File

@ -1,35 +0,0 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

View File

@ -1,96 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'PyCutlass'
copyright = '2022, Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
author = 'Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.duration',
'sphinx.ext.doctest',
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'enum_tools.autoenum',
'sphinx.ext.autosummary',
'm2r2'
]
source_suffix = [".rst", ".md"]
autosummary_generate = True
autosummary_imported_members = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'bizstyle'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

View File

@ -1,13 +0,0 @@
CONV2D Operation
================
.. autoclass:: pycutlass.Conv2dOperation
:special-members:
:members: run
:exclude-members: __weakref__, configuration_name, core_name, extended_name, procedural_name
.. autoclass:: pycutlass.Conv2dArguments
:special-members:
:members:
:exclude-members: initialize
:show-inheritance:

View File

@ -1,100 +0,0 @@
cutlass
=======
.. rubric:: Operator Classification
.. autoclass:: cutlass.OpClass
:members:
.. rubric:: GEMM Layout
.. autoclass:: cutlass.RowMajor
:members:
.. autoclass:: cutlass.ColumnMajor
:members:
.. autoclass:: cutlass.RowMajorInterleaved32
:members:
.. autoclass:: cutlass.ColumnMajorInterleaved32
:members:
.. rubric:: Conv Layout
.. autoclass:: cutlass.TensorNHWC
:members:
.. autoclass:: cutlass.TensorNC32HW32
:members:
.. autoclass:: cutlass.TensorC32RSK32
:members:
.. rubric:: Threadblock Swizzle
.. autoclass:: cutlass.dim3
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle1
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle2
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle4
:special-members:
:members:
.. autoclass:: cutlass.IdentitySwizzle8
:special-members:
:members:
.. autoclass:: cutlass.HorizontalSwizzle
:special-members:
:members:
.. autoclass:: cutlass.BatchedIdentitySwizzle
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradIdentitySwizzle1
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradIdentitySwizzle4
:special-members:
:members:
.. autoclass:: cutlass.StridedDgradHorizontalSwizzle
:special-members:
:members:
.. rubric:: Coordinates
.. autoclass:: cutlass.Tensor4DCoord
:special-members:
:members:
.. autoclass:: cutlass.Tensor3DCoord
:special-members:
:members:
.. autoclass:: cutlass.MatrixCoord
:special-members:
:members:
.. rubric:: Convolution
.. autoclass:: cutlass.conv.Operator
:members:
.. autoclass:: cutlass.conv.IteratorAlgorithm
:members:
.. autoclass:: cutlass.conv.StrideSupport
:members:

View File

@ -1,18 +0,0 @@
GEMM Operation
==============
.. autoclass:: pycutlass.GemmOperationUniversal
:special-members:
:members:
.. autoclass:: pycutlass.GemmOperationGrouped
:special-members:
:members:
.. autoclass:: pycutlass.GemmArguments
:special-members:
:members:
.. autoclass:: pycutlass.GemmGroupedArguments
:special-members:
:members:

View File

@ -1,31 +0,0 @@
.. PyCutlass documentation master file, created by
sphinx-quickstart on Sun Jun 19 12:05:42 2022.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
CUTLASS Python Project Documentation
=====================================
.. mdinclude:: ../../README.md
.. toctree::
:maxdepth: 2
:caption: Contents:
.. Indices and tables
.. ==================
.. * :ref:`genindex`
.. * :ref:`modindex`
.. * :ref:`search`
Indices
==================
.. toctree::
user_guide
visitor_tree
gemm_op
conv2d_op
cutlass

View File

@ -1,225 +0,0 @@
# Epilogue Visitor Tree
The Epilogue Visitor Tree is an experimental feature that directly generates epilogues from user-provided Python functions.
## Usage
The Epilogue Visitor Tree supports many different operations.
### Unary functions
Epilogue Visitor Tree supports unary functions like activation functions. For example,
```python
class UnaryEpilogue_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
T = leaky_relu.numpy(accum, 0.2)
Z = alpha * T + beta * c
return Z
epilogue_functor = UnaryEpilogue_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
```
### Broadcast Operation
Epilogue Visitor Tree supports broadcasting row and column vectors to the whole output matrix. To use broadcast, you just need to specify whether the source vector is a `row` vector or a `column` vector. Here is an example.
```python
class ColumnBroadcast_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
vector: 'column', alpha: 'scalar', beta: 'scalar'):
#
T = accum + vector
scale_T = leaky_relu.numpy(alpha * T, 0.2)
Z = scale_T + beta * c
return Z, T
epilogue_functor = ColumnBroadcast_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
```
### Reduction Operation
The Epilogue Visitor Tree also supports row- and column-wise reductions within each threadblock tile. The syntax for a reduction is
```python
{reduction_output} = reduction_op({input_tensor}, {row|column}, {Add}, {threadblock_shape.n|threadblock_shape.m})
```
The `{row|column}` indicates whether the `row` vectors or the `column` vectors are reduced. The `{Add}` specifies the reduction operation. The `{threadblock_shape.n|threadblock_shape.m}` are the reduction lengths.
**Constraint**
* The `{input_tensor}` can only be the name of a source or an intermediate result. `reduction_op(A + B, ...)` will not work; use `C = A + B` followed by `reduction_op(C, ...)` instead.
* The `{reduction_output}` cannot be used in the epilogue. It will be directly written to global memory after the reduction is done.
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Get output_op
As shown in the user guide, an `output_op` is required by the argument wrapper. We will take the `RowReduction_` as an example to show how to get `output_op`.
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: 'tensor', c: 'tensor',
alpha: 'scalar', beta: 'scalar'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
cta_n = args.threadblock_shape[1]
num_cta_n = (problem_size.n() + cta_n - 1) // cta_n
reduction = np.zeros(shape=(args.batch * problem_size.m() * num_cta_n,), dtype=getattr(np, element_c))
# get output op
output_op = operation.epilogue_type(
D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C, reduction=reduction, problem_size=[problem_size.m(), problem_size.n()]
)
```
Like other epilogue functors such as `LinearCombination`, the output op for EpilogueVisitorTree is also created with `operation.epilogue_type(*)`. However, there are two differences:
* The arguments need to be passed as keyword arguments. The keywords are the argument names in `def __call__`.
* An additional `problem_size=[problem_size.m(), problem_size.n()]` is required.
## Add new Unary Operation (e.g. Activation Function)
To add an additional unary operation to the epilogue visitor tree, a new unary op
should be created for `VisitorOpUnary`. We will take `tanh` as an example.
### Step 1: define TanhVisitor
The visitor defines the parameters and computation required by the unary operation.
The unary operations are registered in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h). But you can define it in any header file and include the header file in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h).
* Two template arguments are required:
* `T`: data type used to compute the unary operation
* `N`: compute fragment length
* We also need to provide the `Arguments` and `Params` structures. The `Arguments` will be assembled by [ctypes](https://docs.python.org/3/library/ctypes.html); the `Params` will be generated from `Arguments` automatically. If the unary function takes no argument, an integer like `int tmp` can be provided to ensure the correctness of ctypes.
* The constructor can only take `params` as its single argument.
* The operation is defined in `Array<T, N> operator()(Array<T, N> const &frag) const`. One common way to do this is to first define a scalar computation, and then apply it to the fragment with an unrolled for-loop.
* A guard function is required. If it returns `false`, all child nodes of the unary node are disabled and zeros are returned to the parent node. This is very helpful for multiplication with a scalar when the scalar is `0`. For general cases, you can just return `true`.
```c++
// T: data type used to compute the unary operation
// N: compute fragment length
template <typename T, int N>
struct TanhVisitor {
/// Argument
struct Arguments {
// a placeholder argument to ensure correctness of ctypes
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { };
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { };
};
/// Param
struct Params {
CUTLASS_HOST_DEVICE
Params(){ };
Params(Arguments const &args) { }
};
/// Constructor
CUTLASS_HOST_DEVICE
TanhVisitor(Params const &params) { }
// scalar operator
CUTLASS_HOST_DEVICE
T tanh_op(T const &scalar) const {
return fast_tanh(scalar);
}
/// vector operator
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
Array<T, N> y;
CUTLASS_PRAGMA_UNROLL
for (int i=0; i < N; ++i) {
y[i] = tanh_op(frag[i]);
}
return y;
}
// Guard
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
```
### Step 2: register Tanh function
After defining the function in C++, we need to register it in Python. The class below gives an example.
* The init function takes the data type `element_compute`, which will be the `T` in the C++ template.
In the init function, we also generate the `_Arguments` class as a `ctypes.Structure`. It includes all the data members in the `TanhVisitor::Arguments`.
* The `_Arguments` class needs to be registered as `self.argument_type` of the `tanh` class.
* An `emit` function is required to emit the namespace and typename of `TanhVisitor`.
* A staticmethod that serves as the numpy reference implementation is required; this is the Python code that the parser consumes.
The built-in functions are defined in [pycutlass/src/pycutlass/epilogue.py](tools/library/scripts/pycutlass/src/pycutlass/epilogue.py). You can define yours in any file as long as it can be found by [/pycutlass/src/pycutlass/parser.py](tools/library/scripts/pycutlass/src/pycutlass/parser.py).
```python
class tanh(ActivationFunctor):
def __init__(self, element_compute) -> None:
super().__init__()
class _Arguments(ctypes.Structure):
_fields_ = [
("tmp", ctypes.c_int)
]
def __init__(self, *args) -> None:
self.tmp = 0
self.argument_type = _Arguments
def emit(self):
return "cutlass::TanhVisitor"
@staticmethod
def numpy(x: np.ndarray):
return np.tanh(x)
```
### Step 3: Run the function
Now the new unary op is ready to use. An epilogue visitor tree can be built with
```python
class RowReduction_(EpilogueVisitTree):
def __call__(
self, accum: NDArray['tensor', 'float32'], c: NDArray['tensor', 'float32'],
alpha: 'float32', beta: 'float32'):
#
D = alpha * accum + tanh.numpy(beta * c)
reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
return D, reduction
epilogue_functor = RowReduction_(
epilogue_functor, tile_description, math_inst.element_accumulator,
C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```
## Limitations and Future work
Although the Epilogue Visitor Tree brings great flexibility to epilogue construction, formulating the epilogue as a single tree imposes several limitations.
* [Future Work] Serial and Parallel Split-K GEMM are not supported yet.
* To support serial split-k, an additional tree transformation pass is required to inject a `binaryOpNode(Add)` + `TensorInputNode` before each `TensorOutputNode` to fetch the partial sum back. The `semaphore` also needs to be passed into the epilogue.
* To support parallel split-k, a reduction-with-visitor kernel is required.
* [Future Work] Convolution and GEMM Grouped are not supported yet.
* To support Conv2d and GEMM Grouped, corresponding *_with_visitor kernels are required.
* [Limitation] If the same node is used by two operations (except when one of them is a reduction), the node and all its descendants will be executed twice.
* [Limitation] The result of reduction can only be used as the return value.

View File

@ -1,283 +0,0 @@
# Basics of PyCUTLASS
PyCUTLASS handles the following things when launching CUTLASS kernels:
* Memory management
* Operation Description
* Code emission and compilation
* Arguments preprocessing
* Kernel launching
* Result Synchronization
## Memory management
PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. At the beginning of the program, call
```python
pycutlass.get_memory_pool({init_pool_size_in_bytes}, {max_pool_size_in_bytes})
```
We also provide functions to query the allocated size.
```python
bytes = get_allocated_size()
```
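Putting both calls together, a minimal sketch (the pool sizes here are arbitrary, and `get_allocated_size` is assumed to be exported by `pycutlass`, as the snippet above suggests):
```python
import pycutlass
from pycutlass import *

# Initialize the RMM pool: 256 bytes initially, growable to 4 GiB.
pycutlass.get_memory_pool(2**8, 2**32)

# Query how many bytes of device memory are currently allocated.
print(get_allocated_size())
```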
## Operation Description
PyCUTLASS provides operation descriptions for GEMM, GEMM Grouped, and Conv2d operations. These operation descriptions are assembled from four fundamental concepts:
* Math Instruction: math instruction executed in GPU cores
* Tile Description: tiling sizes and pipeline stages
* Operand Description: data type, layout, memory alignment
* Epilogue Functor: epilogue function
### Math Instruction
The math instruction is defined as follows:
```python
math_inst = MathInstruction(
{instruction_shape}, {element_a}, {element_b},
{element_acc}, {opclass}, {math_operation}
)
```
The `{instruction_shape}` and `{opclass}` define the instruction size and type. The table below lists valid combinations; a concrete sketch follows the table. `{element_a}` and `{element_b}` define the source operand data types for each instruction, and `{element_acc}` defines the accumulator type. The `{math_operation}` defines the math operation applied.
|Opclass | element_a/element_b | element_acc | instruction_shape | math_operation |
| -- | -- | -- | -- | -- |
| cutlass.OpClass.TensorOp | cutlass.float64 | cutlass.float64 | [8, 8, 4] | MathOperation.multiply_add|
| | cutlass.float32, cutlass.tfloat32, cutlass.float16, cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add, MathOperation.multiply_add_fast_f32, MathOperation.multiply_add_fast_f16, MathOperation.multiply_add_fast_bf16 |
| | cutlass.float16 | cutlass.float16/cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.bfloat16 | cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.int8 | cutlass.int32 | [16, 8, 32] | MathOperation.multiply_add_saturate|
|cutlass.OpClass.Simt| cutlass.float64 | cutlass.float64 | [1, 1, 1] | MathOperation.multiply_add |
| | cutlass.float32 | cutlass.float32 | [1, 1, 1] | MathOperation.multiply_add |
The `cutlass.OpClass.TensorOp` indicates that Tensor Cores are used, while `cutlass.OpClass.Simt` uses the SIMT cores.
The `multiply_add_fast_f32` emulates a fast, accurate SGEMM kernel accelerated by Ampere Tensor Cores. More details can be found in [examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm](examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm).
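For instance, a minimal sketch of the f16 Tensor Core row from the table above (f16 operands, f32 accumulator, 16x8x16 MMA instruction); the imports mirror the quickstart in the PyCUTLASS README:
```python
import pycutlass
from pycutlass import *

# f16 x f16 -> f32 Tensor Core MMA with a 16x8x16 instruction shape,
# matching the corresponding row of the table above.
math_inst = MathInstruction(
    [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float32,
    cutlass.OpClass.TensorOp, MathOperation.multiply_add
)
```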
### Tile Description
The tile description describes the threadblock and warp tiling sizes, as well as the pipeline stages.
```python
tile_description = TileDescription(
{threadblock_shape}, {stages}, {warp_count},
math_inst
)
```
The `{threadblock_shape}` is a list of 3 integers `[Tile_M, Tile_N, Tile_K]` that defines the threadblock tiling size. `{stages}` defines the number of software pipeline stages ([detail](https://developer.nvidia.com/blog/controlling-data-movement-to-boost-performance-on-ampere-architecture/)). `{warp_count}` defines the number of warps along `M`, `N`, and `K` dimension. I.e., with `{threadblock_shape}=[Tile_M, Tile_N, Tile_K]` and `{warp_count}=[W_M, W_N, W_K]`, the warp tile size would be `[Tile_M / W_M, Tile_N / W_N, Tile_K / W_K]`.
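As a concrete sketch, pairing the Tensor Core `math_inst` above with a 128x128x32 threadblock tile, 3 pipeline stages, and a 2x2x1 warp arrangement (an illustrative choice, not a requirement) yields a 64x64x32 warp tile:
```python
# Threadblock tile [128, 128, 32] split across warp_count [2, 2, 1]
# gives a warp tile of [128/2, 128/2, 32/1] = [64, 64, 32].
tile_description = TileDescription(
    [128, 128, 32], 3, [2, 2, 1],
    math_inst
)
```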
### Operand Description
The Operand Description defines the data type, layout, and memory alignment of the input tensors A, B, and C. The output tensor D shares the same attributes as C. The description is as follows:
```python
A = TensorDescription(
{element_a}, {layout_a}, {alignment_a}
)
B = TensorDescription(
{element_b}, {layout_b}, {alignment_b}
)
C = TensorDescription(
{element_c}, {layout_c}, {alignment_c}
)
```
The table below lists the supported layouts and data types for each operation:
| Operation | data type | layout |
| -- | -- | -- |
| GEMM, GEMM Grouped | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.RowMajor, cutlass.ColumnMajor |
| | cutlass.int8 | cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32|
| Conv2d Fprop, Dgrad, Wgrad | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.TensorNHWC |
| Conv2d Fprop | cutlass.int8 | cutlass.TensorNHWC, cutlass.TensorNC32HW32, cutlass.TensorC32RSK32|
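For example, half-precision row-major A and B aligned to 8 elements, with a single-precision row-major C aligned to 4 elements, can be described as follows (passing the fields positionally, per the template above):
```python
A = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
B = TensorDescription(cutlass.float16, cutlass.RowMajor, 8)
C = TensorDescription(cutlass.float32, cutlass.RowMajor, 4)
```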
### Epilogue Functor
The epilogue functor defines the epilogue executed after the mainloop.
We expose the following epilogue functors:
| Epilogue Functor | Remark |
| -- | -- |
| LinearCombination | $D=\alpha \times Accum + \beta \times C$ |
| LinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; the output is clamped to the representable range of the output data type |
| FastLinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; only used for problem sizes $K\le 256$ with cutlass.int8 output, accumulator data type `cutlass.int32`, and epilogue compute data type `cutlass.float32` |
| LinearCombinationGeneric | $D = activation(\alpha \times Accum + \beta \times C)$; available activations include `relu`, `leaky_relu`, `tanh`, `sigmoid`, `silu`, `hardswish`, and `gelu` |
The epilogue functors can be created as follows:
```python
# LinearCombination
epilogue_functor = LinearCombination(
element_C, alignment_c, element_acc, element_epilogue_compute
)
# LinearCombinationClamp
epilogue_functor = LinearCombinationClamp(
element_C, alignment_c, element_acc, element_epilogue_compute
)
# FastLinearCombinationClamp
epilogue_functor = FastLinearCombinationClamp(
element_C, alignment_c
)
# LinearCombinationGeneric
epilogue_functor = LinearCombinationGeneric(
relu(element_epilogue_compute), element_C, alignment_c,
element_acc, element_epilogue_compute
)
```
We also provide an experimental feature, the "Epilogue Visitor Tree", for the GEMM operation. Details can be found in [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md).
### GEMM Operation
The GEMM Operation description can be created with
```python
operation = GemmOperationUniversal(
{compute_capability}, tile_description,
A, B, C, epilogue_functor,
{swizzling_functor}, {visitor}
)
```
* `{compute_capability}` is an integer indicating the compute capability of the target GPU. For A100, it is 80.
* `{swizzling_functor}` describes how threadblocks are scheduled on the GPU. This is used to improve L2 locality ([detail](https://developer.nvidia.com/blog/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling/)). Currently we support `cutlass.{IdentitySwizzle1|IdentitySwizzle2|IdentitySwizzle4|IdentitySwizzle8|BatchedIdentitySwizzle}`. The last is used for batched or array GEMM.
* `{visitor}`: a boolean indicating whether the epilogue visitor tree is used (a sketch follows this list).
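Putting the pieces together, a minimal sketch of an SM80 GEMM description without the epilogue visitor tree might look like the following (reusing `tile_description`, `A`, `B`, `C`, and `epilogue_functor` from the previous sections, and assuming `{visitor}` may be passed positionally as `False`):
```python
operation = GemmOperationUniversal(
    80, tile_description,
    A, B, C, epilogue_functor,
    cutlass.IdentitySwizzle1, False  # no epilogue visitor tree
)
```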
### GEMM Grouped Operation
The GEMM Grouped Operation description can be created with
```python
operation = GemmOperationGrouped(
compute_capability, tile_description,
A, B, C, epilogue_functor,
swizzling_functor, {precompute_mode}
)
```
* `{precompute_mode}`: either `SchedulerMode.Host` or `SchedulerMode.Device`. See [examples/24_gemm_grouped](examples/24_gemm_grouped) for more details.
### Conv2d Operation
The Conv2d Operation description can be created with
```python
operation = Conv2dOperation(
{conv_kind}, {iterator_algorithm},
compute_capability, tile_description,
A, B, C, {stride_support},
epilogue_functor, swizzling_functor
)
```
* `{conv_kind}` defines which convolution is executed. Available options are `fprop`, `dgrad`, and `wgrad`.
* `{iterator_algorithm}` specifies the iterator algorithm used by the implicit GEMM in the convolution. The options are as follows:
    * `analytic`: functionally correct in all cases, but at lower performance
    * `optimized`: optimized for R <= 32, S <= 32, and unity-stride dgrad
    * `fixed_channels`: analytic algorithm optimized for a fixed channel count (C == AccessSize)
    * `few_channels`: analytic algorithm optimized for few channels (C divisible by AccessSize)
* `{stride_support}`: distinguishes among partial specializations that accelerate certain problems where the convolution stride is unity:
    * `strided`: arbitrary convolution stride
    * `unity`: unit convolution stride
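For example, an SM80 forward-propagation convolution using the optimized iterator algorithm and strided stride support might be described as follows (a sketch reusing the descriptions from above, with arguments passed per the template):
```python
operation = Conv2dOperation(
    cutlass.conv.Operator.fprop, cutlass.conv.IteratorAlgorithm.optimized,
    80, tile_description,
    A, B, C, StrideSupport.Strided,
    epilogue_functor, cutlass.IdentitySwizzle1
)
```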
***
## Code Emission and Compilation
Once the operation description is created, the related host and device code can be compiled with
```python
import pycutlass
pycutlass.compiler.add_module([operation,])
```
Several operations can be compiled together. By default, the `nvcc` at `$CUDA_INSTALL_PATH/bin` is used as the compiler backend, but you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with
```python
pycutlass.compiler.nvrtc()
```
We also have an internal compilation artifact manager that caches compiled kernels both in memory and on disk. The `compiled_cache.db` file in your workspace is the database that contains the binary files. Delete this file if you want to recompile the kernels.
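For example, assuming two previously constructed descriptions named `gemm_operation` and `conv_operation`, both can be compiled in one call, and the on-disk cache can be removed to force recompilation:
```python
import os
import pycutlass

# compile several operations together
pycutlass.compiler.add_module([gemm_operation, conv_operation])

# delete the on-disk kernel cache to force recompilation on the next run
if os.path.exists("compiled_cache.db"):
    os.remove("compiled_cache.db")
```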
***
## Argument Processing
We provide argument wrappers to convert Python tensors to kernel parameters. Currently, [torch.Tensor](https://pytorch.org/), [numpy.ndarray](https://numpy.org/), and [cupy.ndarray](https://cupy.dev/) are supported.
### GEMM Arguments
The Gemm arguments can be created with
```python
arguments = GemmArguments(
operation=operation, problem_size={problem_size},
A={tensor_A}, B={tensor_B}, C={tensor_C}, D={tensor_D},
output_op={output_op},
gemm_mode={gemm_mode},
split_k_slices={split_k_slices}, batch={batch}
)
```
* `problem_size` is a `cutlass.gemm.GemmCoord(M, N, K)` object that defines an $M\times N\times K$ matrix multiplication.
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `gemm_mode`, `split_k_slices`, and `batch` are related as follows:
|gemm_mode| split_k_slices | batch | remark|
|--|--|--|--|
|cutlass.gemm.Mode.Gemm | number of split-K slices | - | the ordinary GEMM or GEMM with serial split-K|
|cutlass.gemm.Mode.GemmSplitKParallel | number of split-K slices | - | GEMM Split-K Parallel|
|cutlass.gemm.Mode.Batched | - | batch size | Batched GEMM |
|cutlass.gemm.Mode.Array | - | batch size | Array GEMM |
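As a concrete sketch, an ordinary GEMM with flattened `numpy` tensors could be set up as follows (the sizes are illustrative, and the data types assume the `operation` constructed earlier in this guide):
```python
import numpy as np

M, N, K = 512, 256, 128
tensor_A = np.random.uniform(-1, 1, (M * K,)).astype(np.float16)
tensor_B = np.random.uniform(-1, 1, (K * N,)).astype(np.float16)
tensor_C = np.random.uniform(-1, 1, (M * N,)).astype(np.float32)
tensor_D = np.zeros((M * N,)).astype(np.float32)

arguments = GemmArguments(
    operation=operation, problem_size=cutlass.gemm.GemmCoord(M, N, K),
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    output_op=operation.epilogue_type(1.0, 0.0),
    gemm_mode=cutlass.gemm.Mode.Gemm, split_k_slices=1
)
```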
### GEMM Grouped Arguments
The GEMM grouped arguments can be created with
```python
arguments = GemmGroupedArguments(
    operation, {problem_sizes_coord}, {tensor_As}, {tensor_Bs}, {tensor_Cs}, {tensor_Ds},
    output_op=output_op
)
```
* `{problem_sizes_coord}` is a list of `cutlass.gemm.GemmCoord(M, N, K)`, one per problem.
* `{tensor_Xs}` is a list of user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
### Conv2d Arguments
The Conv2d arguments can be created with
```python
arguments = Conv2dArguments(
operation, {problem_size}, {tensor_A},
{tensor_B}, {tensor_C}, {tensor_D},
{output_op},
{split_k_mode},
{split_k_slices}
)
```
* `problem_size`: it can be constructed with
```python
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(N, H, W, C),
cutlass.Tensor4DCoord(K, R, S, C),
cutlass.Tensor4DCoord(pad[0], pad[1], pad[2], pad[3]),
cutlass.MatrixCoord(stride[0], stride[1]),
cutlass.MatrixCoord(dilation[0], dilation[1]),
cutlass.conv.Mode.cross_correlation,
split_k_slices, 1
)
```
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `split_k_mode`: currently we support `cutlass.conv.SplitKMode.Serial` and `cutlass.conv.SplitKMode.Parallel`.
* `split_k_slices`: the number of split-K slices.
For an ordinary conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slices=1`.
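Putting it together, an ordinary 3x3 fprop problem might be set up as follows (a sketch; the tensor sizes use the `activation_size()`, `filter_size()`, and `output_size()` helpers of `Conv2dProblemSize`):
```python
import numpy as np

problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(32, 224, 224, 128),  # activation (N, H, W, C)
    cutlass.Tensor4DCoord(128, 3, 3, 128),     # filter (K, R, S, C)
    cutlass.Tensor4DCoord(1, 1, 1, 1),         # padding
    cutlass.MatrixCoord(1, 1),                 # stride
    cutlass.MatrixCoord(1, 1),                 # dilation
    cutlass.conv.Mode.cross_correlation,
    1, 1                                       # split_k_slices, groups
)

tensor_A = np.random.uniform(-1, 1, (problem_size.activation_size(),)).astype(np.float16)
tensor_B = np.random.uniform(-1, 1, (problem_size.filter_size(),)).astype(np.float16)
tensor_C = np.random.uniform(-1, 1, (problem_size.output_size(),)).astype(np.float32)
tensor_D = np.zeros((problem_size.output_size(),)).astype(np.float32)

arguments = Conv2dArguments(
    operation, problem_size, tensor_A, tensor_B, tensor_C, tensor_D,
    operation.epilogue_type(1.0, 0.0),
    cutlass.conv.SplitKMode.Serial, 1
)
```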
### Getting output_op
An `output_op` can be created as follows:
```python
output_op = operation.epilogue_type(*([alpha, beta] + args.activation_args))
```
It is constructed from a list of arguments starting with the scaling factors `alpha` and `beta`.
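For a plain `LinearCombination` epilogue there are no activation arguments, so this reduces to the two scalars:
```python
output_op = operation.epilogue_type(1.0, 0.0)  # alpha = 1.0, beta = 0.0
```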
The `output_op` of EpilogueVisitorTree is slightly different. Please check [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md) for details.
## Kernel Launching
With the arguments and operation in hand, the kernel can be launched simply with
```python
operation.run(arguments)
```
## Sync results
We also provide a function to synchronize the kernel execution. If you use `numpy`, it will also copy the result back to the host. To do so, run
```python
arguments.sync()
```
If you use EpilogueVisitorTree, please call
```python
output_op.sync()
```
## Reduction Kernel behind Parallel Split-K
If you use parallel split-K in GEMM or Conv2d, an additional reduction kernel is required. Please check [examples/40_cutlass_py](examples/40_cutlass_py) for details.

View File

@ -1,4 +0,0 @@
User Guide
=====================================
.. mdinclude:: ./md/basic_idea.md

View File

@ -1,4 +0,0 @@
User Guide
=====================================
.. mdinclude:: ./md/EpilogueVisitorTree.md

View File

@ -1,106 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from pycutlass import *
import pycutlass
from pycutlass.epilogue import LinearCombination
from pycutlass.test.conv2d_testbed import Conv2dLauncher
if __name__ == "__main__":
pycutlass.get_memory_pool(2**33, 2**33)
pycutlass.compiler.nvcc()
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=4,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
profiler = Conv2dLauncher(operation, verification=False, profiling=True)
python_runtime = profiler.run(
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(32, 224, 224, 128),
cutlass.Tensor4DCoord(128, 3, 3, 128),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
), split_k_mode=cutlass.conv.SplitKMode.Serial
)
cpp_runtime = profiler.run_cutlass_profiler(
problem_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(32, 224, 224, 128),
cutlass.Tensor4DCoord(128, 3, 3, 128),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
), split_k_mode=cutlass.conv.SplitKMode.Serial
)
print(cpp_runtime / python_runtime)

View File

@ -1,91 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.test.gemm_testbed import GemmUniversalLauncher
if __name__ == '__main__':
    pycutlass.get_memory_pool(2**32, 2**32)
    pycutlass.compiler.nvcc()

    math_inst = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16, element_b=cutlass.float16,
        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )

    tile_description = TileDescription(
        threadblock_shape=[256, 128, 32],
        stages=3, warp_count=[4, 2, 1],
        math_instruction=math_inst
    )

    A = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    B = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    C = TensorDescription(
        element=cutlass.float32, layout=cutlass.ColumnMajor,
        alignment=4
    )

    element_epilogue = cutlass.float32
    epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)
    swizzling_functor = cutlass.IdentitySwizzle1

    operation = GemmOperationUniversal(
        arch=80, tile_description=tile_description,
        A=A, B=B, C=C, element_epilogue=element_epilogue,
        epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
    )

    profiler = GemmUniversalLauncher(operation, verification=False, profiling=True)

    python_runtime = profiler.run(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096)
    )

    cpp_runtime = profiler.run_cutlass_profiler(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096),
    )

    print(cpp_runtime / python_runtime)

View File

@ -1,9 +0,0 @@
[build-system]
requires = [
"setuptools",
"scikit-build>0.13.1",
"pybind11",
"numpy<1.23",
"cmake>=3.20.1,!=3.23.0"
]

View File

@ -1,116 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import distutils.cmd
from setuptools import setup
import setuptools.command.build_py
import os
# build rmm dependency
class BuildRMM(distutils.cmd.Command):
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        try:
            import rmm
        except ImportError:
            print("installing rmm")
            os.system("git clone -b branch-22.10 --recurse-submodules https://github.com/rapidsai/rmm.git")
            os.chdir("./rmm")
            os.system("./build.sh librmm rmm")
            os.chdir("./python")
            os.system("python setup.py build_ext --inplace")
            os.system("python setup.py install")
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
ext_modules = []
try:
    from pybind11.setup_helpers import Pybind11Extension, build_ext

    include_dirs = [
        cutlass_path + "/include",
        cuda_install_path + "/include",
        cutlass_path + "/tools/util/include",
        cutlass_path + "/test",
        cutlass_path + "/tools/library/scripts/pycutlass/googletest/googletest/include"
    ]

    ext_modules = [
        Pybind11Extension("cutlass",
                          ["src/cpp/cutlass.cpp"],
                          include_dirs=include_dirs,
                          extra_compile_args=["-fpermissive", "-w", "-std=c++17"]),
        Pybind11Extension("cute",
                          ["src/cpp/cute.cpp"],
                          include_dirs=include_dirs,
                          extra_compile_args=["-fpermissive", "-w", "-std=c++17"])
    ]
except ImportError:
    pass
setup(
    name="PyCutlass",
    version="0.0.1",
    author="Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall",
    author_email="zhaodongc@nvidia.com",
    description="Python interface for CUTLASS",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    package_dir={"": "src"},
    packages=['pycutlass', 'pycutlass.utils', 'pycutlass.test'],
    setup_requires=["pybind11", "numpy<1.23"],
    install_requires=[
        "numpy<1.23",
        'pybind11',
        'cuda-python>=11.8.0',
        'typeguard',
        'bfloat16',
        'typing',
        'scikit-build',
        'treelib'
    ],
    cmdclass={
        'rmm': BuildRMM
    },
    ext_modules=ext_modules,
    python_requires=">=3.6",
)

View File

@ -1,75 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief In-memory compiled artifact cache
*/
#include <pybind11/pybind11.h>
#include <string>
#include <unordered_map>
namespace py = pybind11;
namespace cutlass {
struct CompileCache {
public:
CompileCache() = default;
~CompileCache() = default;
using Cache = std::unordered_map<std::string, py::object>;
/// Return the compiled kernel if it has been cached; py::none() otherwise
py::object at(const std::string &kernel) {
auto item = cache_.find(kernel);
if (item != cache_.end()) {
return item->second;
}
return py::none();
}
/// Insert a newly compiled kernel into the cache
void insert(const std::string &kernel, const py::object &compiled_kernel){
cache_.emplace(kernel, compiled_kernel);
}
int64_t size() const { return cache_.size(); }
/// Clear the cache
void clear() { cache_.clear(); }
private:
Cache cache_;
};
} // namespace cutlass

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief binding CuTe C++ APIs to Python
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cute/arch/mma_sm90_gmma.hpp"
namespace py = pybind11;
PYBIND11_MODULE(cute, m) {
// module doc
m.doc() = "CuTe C++ bindings";
py::enum_<cute::GMMA::Major>(m, "GMMAMajor",
R"pbdoc(classification of CuTe GMMA tensor major specification)pbdoc")
.value("K", cute::GMMA::Major::K,
R"pbdoc(Tensor is contiguous in reduction dimension)pbdoc")
.value("MN", cute::GMMA::Major::MN,
R"pbdoc(Tensor is contiguous in non-reduction dimension)pbdoc");
}

View File

@ -1,182 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief binding CUTLASS C++ APIs to Python
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "builtin_types.h"
#include "device_launch_parameters.h"
#include "stddef.h"
#include "cutlass/cutlass.h"
#include "include/conv/convolution.h"
#include "include/gemm/gemm.h"
#include "include/types.h"
#include "include/layout/layout.h"
#include "include/tensor_coord.h"
#include "include/arch.h"
#include "include/tensor_ref_view.h"
#include "include/swizzling.h"
#include "test/conv/convolution.h"
#include "test/gemm/gemm.h"
// Data Types
#include "library.h"
// compiler
#include "compiler.h"
namespace py = pybind11;
PYBIND11_MODULE(cutlass, m) {
// module doc
m.doc() = "cutlass C++ binding";
//
// Bind data type
//
bind_cutlass_types(m);
//
// Bind layout
//
bind_layout(m);
//
// Bind tensor coord
//
bind_tensor_coord(m);
//
// Bind tensor ref
//
bind_tensor_refs_and_views(m);
//
// Bind opcode
//
bind_opcode(m);
//
// Bind convolution
//
py::module_ conv_submodule = m.def_submodule("conv");
bind_convolution(conv_submodule);
//
// Bind gemm
//
py::module_ gemm_submodule = m.def_submodule("gemm");
bind_gemm(gemm_submodule);
//
// Bind swizzling
//
bind_threadblock_swizzle(m);
//
// Bind test units
//
py::module_ test = m.def_submodule("test");
py::module_ test_conv = test.def_submodule("conv");
bind_convolution_test(test_conv);
py::module_ test_gemm = test.def_submodule("gemm");
bind_gemm_test(test_gemm);
// data types
py::enum_<cutlass::DataType>(m, "dtype")
.value("b1", cutlass::DataType::kB1)
.value("u2", cutlass::DataType::kU2)
.value("u4", cutlass::DataType::kU4)
.value("u8", cutlass::DataType::kU8)
.value("u16", cutlass::DataType::kU16)
.value("u32", cutlass::DataType::kU32)
.value("u64", cutlass::DataType::kU64)
.value("s2", cutlass::DataType::kS2)
.value("s4", cutlass::DataType::kS4)
.value("s16", cutlass::DataType::kS16)
.value("s64", cutlass::DataType::kS64)
.value("cf16", cutlass::DataType::kCF16)
.value("cbf16", cutlass::DataType::kCBF16)
.value("cf32", cutlass::DataType::kCF32)
.value("ctf32", cutlass::DataType::kCTF32)
.value("cf64", cutlass::DataType::kCF64)
.value("cs2", cutlass::DataType::kCS2)
.value("cs4", cutlass::DataType::kCS4)
.value("cs8", cutlass::DataType::kCS8)
.value("cs16", cutlass::DataType::kCS16)
.value("cs32", cutlass::DataType::kCS32)
.value("cs64", cutlass::DataType::kCS64)
.value("cu2", cutlass::DataType::kCU2)
.value("cu4", cutlass::DataType::kCU4)
.value("cu8", cutlass::DataType::kCU8)
.value("cu16", cutlass::DataType::kCU16)
.value("cu32", cutlass::DataType::kCU32)
.value("cu64", cutlass::DataType::kCU64)
.value("invalid", cutlass::DataType::kInvalid);
// layout types
py::enum_<cutlass::LayoutType>(m, "layout")
.value("ColumnMajorInterleaved2", cutlass::LayoutType::kColumnMajorInterleaved2)
.value("RowMajorInterleaved2", cutlass::LayoutType::kRowMajorInterleaved2)
.value("ColumnMajorInterleaved64", cutlass::LayoutType::kColumnMajorInterleaved64)
.value("RowMajorInterleaved64", cutlass::LayoutType::kRowMajorInterleaved64)
.value("TensorNDHWC", cutlass::LayoutType::kTensorNDHWC)
.value("TensorNCHW", cutlass::LayoutType::kTensorNCHW)
.value("TensorNGHWC", cutlass::LayoutType::kTensorNGHWC)
.value("TensorNC64HW64", cutlass::LayoutType::kTensorNC64HW64)
.value("TensorC64RSK64", cutlass::LayoutType::kTensorC64RSK64);
// transform types
py::enum_<cutlass::ComplexTransform>(m, "complex_transform")
.value("none", cutlass::ComplexTransform::kNone)
.value("conj", cutlass::ComplexTransform::kConjugate);
//
// Compiler
//
py::class_<cutlass::CompileCache>(m, "CompileCache")
.def(py::init<>())
.def("at", &cutlass::CompileCache::at)
.def("insert", &cutlass::CompileCache::insert)
.def("size", &cutlass::CompileCache::size)
.def("clear", &cutlass::CompileCache::clear);
}

View File

@ -1,59 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind opcode classes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/arch/mma.h"
namespace py = pybind11;
namespace cutlass {
enum class OpcodeClass {
kSimt, kTensorOp, kWmmaTensorOp, kSparseTensorOp
};
}
void bind_opcode(py::module &m) {
py::enum_<cutlass::OpcodeClass>(m, "OpClass",
R"pbdoc(classification of math operators)pbdoc")
.value("Simt", cutlass::OpcodeClass::kSimt,
R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc")
.value("TensorOp", cutlass::OpcodeClass::kTensorOp,
R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc")
.value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp,
R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc")
.value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp,
R"pbdoc(Tag classifying operators as sparseTensor Core operations)pbdoc");
}

View File

@ -1,102 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution problem sizes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
void bind_conv_problem_size(py::module &m) {
//
// Conv2d Problem Size:
// include/cutlass/conv/conv2d_problem_size.h
//
py::class_<cutlass::conv::Conv2dProblemSize>(m, "Conv2dProblemSize")
// constructors
.def(py::init<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, cutlass::conv::Mode, int, int>())
.def(py::init<cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::MatrixCoord, cutlass::MatrixCoord, cutlass::conv::Mode, int, int>())
// attribute accessors
.def_readwrite("N", &cutlass::conv::Conv2dProblemSize::N)
.def_readwrite("H", &cutlass::conv::Conv2dProblemSize::H)
.def_readwrite("W", &cutlass::conv::Conv2dProblemSize::W)
.def_readwrite("C", &cutlass::conv::Conv2dProblemSize::C)
.def_readwrite("P", &cutlass::conv::Conv2dProblemSize::P)
.def_readwrite("Q", &cutlass::conv::Conv2dProblemSize::Q)
.def_readwrite("K", &cutlass::conv::Conv2dProblemSize::K)
.def_readwrite("R", &cutlass::conv::Conv2dProblemSize::R)
.def_readwrite("S", &cutlass::conv::Conv2dProblemSize::S)
.def_readwrite("pad_h", &cutlass::conv::Conv2dProblemSize::pad_h)
.def_readwrite("pad_w", &cutlass::conv::Conv2dProblemSize::pad_w)
.def_readwrite("stride_h", &cutlass::conv::Conv2dProblemSize::stride_h)
.def_readwrite("stride_w", &cutlass::conv::Conv2dProblemSize::stride_w)
.def_readwrite("dilation_h", &cutlass::conv::Conv2dProblemSize::dilation_h)
.def_readwrite("dilation_w", &cutlass::conv::Conv2dProblemSize::dilation_w)
.def_readwrite("mode", &cutlass::conv::Conv2dProblemSize::mode)
.def_readwrite("split_k_slices", &cutlass::conv::Conv2dProblemSize::split_k_slices)
.def_readwrite("groups", &cutlass::conv::Conv2dProblemSize::groups)
// functions
.def("reset_split_k_slices", &cutlass::conv::Conv2dProblemSize::reset_split_k_slices)
.def("activation_extent", &cutlass::conv::Conv2dProblemSize::activation_extent)
.def("filter_extent", &cutlass::conv::Conv2dProblemSize::filter_extent)
.def("output_extent", &cutlass::conv::Conv2dProblemSize::output_extent)
.def("activation_size", &cutlass::conv::Conv2dProblemSize::activation_size)
.def("filter_size", &cutlass::conv::Conv2dProblemSize::filter_size)
.def("output_size", &cutlass::conv::Conv2dProblemSize::output_size);
// Get tensor size
m.def("implicit_gemm_tensor_a_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_a_size));
m.def("implicit_gemm_tensor_b_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_b_size));
m.def("implicit_gemm_tensor_c_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_c_size));
// Get tensor extent
m.def("implicit_gemm_tensor_a_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_a_extent));
m.def("implicit_gemm_tensor_b_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_b_extent));
m.def("implicit_gemm_tensor_c_extent",
py::overload_cast<
cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
>(&cutlass::conv::implicit_gemm_tensor_c_extent));
m.def("implicit_gemm_problem_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize &>(&cutlass::conv::implicit_gemm_problem_size));
}

View File

@ -1,91 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related enum types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problem_size.h"
#include "host.h"
#include "cutlass/conv/convolution.h"
namespace py = pybind11;
void bind_convolution(py::module &m) {
//
// Enumerate types
// cutlass/include/cutlass/conv/convolution.h
//
/// Convolutional operator
py::enum_<cutlass::conv::Operator>(m, "Operator", R"pbdoc(Convolutional operator)pbdoc")
.value("fprop", cutlass::conv::Operator::kFprop, "Forward propagation")
.value("dgrad", cutlass::conv::Operator::kDgrad, "Activation grad")
.value("wgrad", cutlass::conv::Operator::kWgrad, "Weight grad");
/// Distinguishes convolution from cross correlation
py::enum_<cutlass::conv::Mode>(m, "Mode")
.value("cross_correlation", cutlass::conv::Mode::kCrossCorrelation)
.value("convolution", cutlass::conv::Mode::kConvolution);
/// Selects among several implementation variants trading off performance with simplicity
py::enum_<cutlass::conv::IteratorAlgorithm>(m, "IteratorAlgorithm",
R"pbdoc(Selects among several implementation variants trading off performance with simplicity)pbdoc")
.value("analytic", cutlass::conv::IteratorAlgorithm::kAnalytic, R"pbdoc(functionally correct in all cases but lower performance)pbdoc")
.value("optimized", cutlass::conv::IteratorAlgorithm::kOptimized, R"pbdoc(optimized for R <= 32, S <= 32 and unity-stride dgrad)pbdoc")
.value("fixed_channels", cutlass::conv::IteratorAlgorithm::kFixedChannels, R"pbdoc(Analytic algorithm optimized for fixed channel count (C == AccessSize))pbdoc")
.value("few_channels", cutlass::conv::IteratorAlgorithm::kFewChannels, R"pbdoc(Analytic algorithm optimized for few channels (C divisible by AccessSize))pbdoc");
/// Distinguishes among partial specializations that accelerate certain problems where convolution
/// stride is unit.
py::enum_<cutlass::conv::StrideSupport>(m, "StrideSupport",
R"pbdoc(Distinguishes among partial specializations that accelerate certain problems where convolution
stride is unit.)pbdoc")
.value("strided", cutlass::conv::StrideSupport::kStrided, R"pbdoc(arbitrary convolution stride)pbdoc")
.value("unity", cutlass::conv::StrideSupport::kUnity, R"pbdoc(unit convolution stride)pbdoc");
/// Identifies split-K mode
py::enum_<cutlass::conv::SplitKMode>(m, "SplitKMode")
.value("None", cutlass::conv::SplitKMode::kNone)
.value("Serial", cutlass::conv::SplitKMode::kSerial)
.value("Parallel", cutlass::conv::SplitKMode::kParallel);
// Conv problem sizes
bind_conv_problem_size(m);
//
// host helper functions
//
py::module_ host_submodule = m.def_submodule("host");
bind_conv_host_helper(host_submodule);
}

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind conv host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_conv_host_helper(py::module &m) {
/// reorder operand B for interleaved layout
m.def("reorder_convK", [](
cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> dest,
cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> src,
cutlass::conv::Operator conv_op, const cutlass::conv::Conv2dProblemSize & problem_size) {
cutlass::gemm::GemmCoord implicit_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_op, problem_size);
cutlass::reorder_convK<32>(dest, src, implicit_problem_size);
});
}

View File

@ -1,222 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A generic wrapper around an epilogue visitor operation
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"
#include "epilogue_visitor_op/visitor_op_linear_combination.h"
#include "epilogue_visitor_op/visitor_op_tensor_input.h"
#include "epilogue_visitor_op/visitor_op_accumulator.h"
#include "epilogue_visitor_op/visitor_op_row_broadcast.h"
#include "epilogue_visitor_op/visitor_op_tensor_output.h"
#include "epilogue_visitor_op/visitor_op_column_reduction.h"
#include "epilogue_visitor_op/visitor_op_row_reduction.h"
#include "epilogue_visitor_op/visitor_op_column_broadcast.h"
#include "epilogue_visitor_op/visitor_op_unary.h"
#include "epilogue_visitor_op/visitor_op_binary.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Generic Epilogue Visitor.
template <
typename OutputOp_
>
class EpilogueVisitorGeneric {
public:
using OutputOp = OutputOp_;
using AccumulatorAccessType = typename OutputOp::AccumulatorAccessType;
static int const kElementsPerAccess = OutputOp::kElementsPerAccess;
using ElementOutput = typename OutputOp::ElementOutput;
using OutputTileIterator = typename OutputOp::OutputTileIterator;
static int const kIterations = OutputTileIterator::kIterations;
///
/// End Epilogue Tree
///
/// An additional SMEM buffer is not required in the broadcast epilogue visitor
struct SharedStorage {
typename OutputOp::SharedStorage output_smem;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
public:
/// Argument structure
struct Arguments {
typename OutputOp::Arguments output_op_args;
//
// Methods
//
Arguments() { }
Arguments(
typename OutputOp::Arguments output_op_args
):
output_op_args(output_op_args)
{
}
};
struct Params {
typename OutputOp::Params output_op_params;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
output_op_params(args.output_op_args)
{
}
};
private:
OutputOp output_op;
public:
/// Constructor
CUTLASS_DEVICE
EpilogueVisitorGeneric(
Params const &params, ///< Parameters routed to the epilogue
SharedStorage &shared_storage, ///< Shared storage needed by the functors here
MatrixCoord threadblock_offset,
gemm::GemmCoord threadblock_tile_offset,
int thread_idx,
MatrixCoord problem_size
):
output_op(params.output_op_params, shared_storage.output_smem, thread_idx, threadblock_offset, problem_size)
{ }
/// Helper to indicate split-K behavior
CUTLASS_DEVICE
void set_k_partition(
int split_k_index, ///< Index of this threadblock within split-K partitioned scheme
int split_k_slices) { ///< Total number of split-K slices
}
/// Called to set the batch index
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
output_op.set_batch_index(batch_idx);
}
/// Called at the start of the epilogue just before iterating over accumulator slices
CUTLASS_DEVICE
void begin_epilogue() {
output_op.begin_epilogue();
}
/// Called at the start of one step before starting accumulator exchange
CUTLASS_DEVICE
void begin_step(int step_idx) {
output_op.begin_step(step_idx);
}
/// Called at the start of a row
CUTLASS_DEVICE
void begin_row(int row_idx) {
output_op.begin_row(row_idx);
}
/// Called after accumulators have been exchanged for each accumulator vector
CUTLASS_DEVICE
void visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum) {
output_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
}
/// Called at the end of a row
CUTLASS_DEVICE
void end_row(int row_idx) {
output_op.end_row(row_idx);
}
/// Called after all accumulator elements have been visited
CUTLASS_DEVICE
void end_step(int step_idx) {
output_op.end_step(step_idx);
}
/// Called after all steps have been completed
CUTLASS_DEVICE
void end_epilogue() {
output_op.end_epilogue();
}
};
////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

View File

@ -1,84 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief A file containing the binary ops
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Vector addition
template <typename T, int N>
struct VectorAdd {
struct Arguments {
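// placeholder argument to keep the struct non-empty; mirrors the ctypes placeholder used by the unary ops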
int tmp;
CUTLASS_HOST_DEVICE
Arguments():tmp(0){ }
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { }
};
struct Params {
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
CUTLASS_HOST_DEVICE
VectorAdd(
Params const &params
) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
cutlass::plus<Array<T, N>> add_op;
return add_op(lhs, rhs);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,233 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the unary ops used by epilogue visitors
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/epilogue/thread/activation.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Scalar multiplication
template <typename T, int N>
struct Mult {
struct Arguments {
T alpha;
CUTLASS_HOST_DEVICE
Arguments():alpha(T(1.0)){ }
CUTLASS_HOST_DEVICE
Arguments(T alpha): alpha(alpha) { }
};
struct Params {
T alpha; ///< scales accumulators
CUTLASS_HOST_DEVICE
Params():alpha(T(1.0)){ }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): alpha(args.alpha) { }
};
T alpha_;
CUTLASS_HOST_DEVICE
Mult(
Params const &params
):
alpha_(params.alpha)
{ }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &source) const {
cutlass::multiplies<Array<T, N>> multiply_op;
return multiply_op(source, alpha_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return alpha_ != T(0);
}
};
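// guard() reports whether this node can contribute a nonzero result; a parent
// visitor may use it to skip evaluating this subtree entirely when alpha == 0.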
/// ReLU
template <typename T, int N>
struct ReLUVisitor {
struct Arguments {
T threshold;
CUTLASS_HOST_DEVICE
Arguments():threshold(T(0.0)) { }
CUTLASS_HOST_DEVICE
Arguments(T threshold): threshold(threshold) { }
};
struct Params {
T threshold;
CUTLASS_HOST_DEVICE
Params():threshold(T(0.0)) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): threshold(args.threshold) { }
};
T threshold_;
CUTLASS_HOST_DEVICE
ReLUVisitor(Params const &params):
threshold_(params.threshold) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
maximum<Array<T, N>> mx;
return mx(frag, threshold_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/// leakyReLU
template <typename T, int N>
struct LeakyReLUVisitor {
struct Arguments {
T leaky_alpha;
CUTLASS_HOST_DEVICE
Arguments():leaky_alpha(T(0.0)) { }
CUTLASS_HOST_DEVICE
Arguments(T leaky_alpha): leaky_alpha(leaky_alpha) { }
};
struct Params {
T leaky_alpha;
CUTLASS_HOST_DEVICE
Params():leaky_alpha(T(0.0)) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args): leaky_alpha(args.leaky_alpha) { }
};
T leaky_alpha_;
CUTLASS_HOST_DEVICE
LeakyReLUVisitor(Params const &params):
leaky_alpha_(params.leaky_alpha) { }
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
cutlass::epilogue::thread::LeakyReLU<Array<T, N>> leaky_op;
return leaky_op(frag, leaky_alpha_);
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/// Tanh
template <typename T, int N>
struct TanhVisitor {
/// Argument
struct Arguments {
// a placeholder argument to ensure correctness of ctypes
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { };
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { };
};
/// Param
struct Params {
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
/// Constructor
CUTLASS_HOST_DEVICE
TanhVisitor(Params const &params) { }
/// Scalar operator (uses the fast_tanh approximation)
CUTLASS_HOST_DEVICE
T tanh_op(T const &scalar) const {
return fast_tanh(scalar);
}
/// vector operator
CUTLASS_HOST_DEVICE
Array<T, N> operator()(Array<T, N> const &frag) const {
Array<T, N> y;
CUTLASS_PRAGMA_UNROLL
for (int i=0; i < N; ++i) {
y[i] = tanh_op(frag[i]);
}
return y;
}
CUTLASS_HOST_DEVICE
bool guard() {
return true;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,148 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that returns the accumulator
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following Computation
///
/// ElementAccumulator accum;
/// return accum;
///
/// It can only be the leaf node of the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
int kElementsPerAccess_ ///< Number of elements computed per operation
>
class VisitorOpAccumulator{
public:
using ElementAccumulator = ElementAccumulator_;
static int const kElementsPerAccess = kElementsPerAccess_;
/// Fragment type for Accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type returned by this visitor
using VisitAccessType = AccumulatorAccessType;
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
// Note: ctypes has issues with empty argument structs, so a placeholder member is kept
int tmp;
CUTLASS_HOST_DEVICE
Arguments(): tmp(0) { }
CUTLASS_HOST_DEVICE
Arguments(int tmp): tmp(tmp) { }
};
/// Parameter structure
struct Params {
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args) { }
};
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpAccumulator(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) { }
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) { }
CUTLASS_DEVICE
void begin_row(int row_idx) { }
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
return accum;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,245 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with a binary op
*/
#pragma once
#include "cutlass/cutlass.h"
#include "binary_ops.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute C = BinaryOp(ElementCompute(Visitor_A), ElementCompute(Visitor_B));
/// return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename VisitorA_, ///< Child node A
typename VisitorB_, ///< Child node B
template<typename T, int N> typename BinaryOp_
>
class VisitorOpBinary{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using VisitorA = VisitorA_;
using VisitorB = VisitorB_;
/// Fragment type returned from VisitorA.visit
using VisitAccessTypeA = typename VisitorA::VisitAccessType;
using ElementA = typename VisitAccessTypeA::Element;
/// Fragment type returned from VisitorB.visit
using VisitAccessTypeB = typename VisitorB::VisitAccessType;
using ElementB = typename VisitAccessTypeB::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
using BinaryOp = BinaryOp_<ElementCompute, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename VisitorA::SharedStorage storage_a;
typename VisitorB::SharedStorage storage_b;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
typename BinaryOp::Arguments binary_arg;
typename VisitorA::Arguments visitor_a_arg; ///< Argument type for visitor_a
typename VisitorB::Arguments visitor_b_arg; ///< Argument type for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():binary_arg() { }
CUTLASS_HOST_DEVICE
Arguments(
typename BinaryOp::Arguments binary_arg,
typename VisitorA::Arguments visitor_a_arg,
typename VisitorB::Arguments visitor_b_arg
):
binary_arg(binary_arg),
visitor_a_arg(visitor_a_arg),
visitor_b_arg(visitor_b_arg)
{ }
};
/// Parameter structure
struct Params {
typename BinaryOp::Params binary_param;
typename VisitorA::Params visitor_a_param; ///< Params for visitor_a
typename VisitorB::Params visitor_b_param; ///< Params for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
binary_param(args.binary_arg),
visitor_a_param(args.visitor_a_arg),
visitor_b_param(args.visitor_b_arg)
{ }
};
private:
//
// Data members
//
BinaryOp binary_op;
VisitorA visitor_a_op;
VisitorB visitor_b_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpBinary(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
binary_op(params.binary_param),
visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void begin_epilogue() {
visitor_a_op.begin_epilogue();
visitor_b_op.begin_epilogue();
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
visitor_a_op.set_batch_index(batch_idx);
visitor_b_op.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_a_op.begin_step(step_idx);
visitor_b_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_a_op.begin_row(row_idx);
visitor_b_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor A and visitor B
VisitAccessTypeA result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
VisitAccessTypeB result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
/// Type conversion
NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;
return binary_op(
source_converter_A(result_A),
source_converter_B(result_B)
);
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_a_op.end_row(row_idx);
visitor_b_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_a_op.end_step(step_idx);
visitor_b_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_a_op.end_epilogue();
visitor_b_op.end_epilogue();
}
};
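// Composition sketch (illustrative type choices, not a drop-in configuration):
//
//   using Accum   = VisitorOpAccumulator<float, 4>;
//   using Bias    = VisitorOpRowBroadcast<float, float, InputTileIterator>;
//   using AddBias = VisitorOpBinary<float, float, 4, Accum, Bias, cutlass::VectorAdd>;
//
// For each fragment, AddBias::visit converts both child results to
// ElementCompute and returns VectorAdd(accum, bias).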
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,250 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that broadcasts a vector to all columns
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementVector T[i][j] <- device-memory Td[i]
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementFragment_, ///< Data type used to cache vector in register
typename InputTileIterator_ ///< Tile iterator type to read the broadcasted tensor
>
class VisitorOpColumnBroadcast {
public:
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementAccumulator = ElementAccumulator_;
using ElementVector = typename InputTileIterator::Element;
using ElementFragment = ElementFragment_;
using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;
/// Thread map used by input tile iterators
using ThreadMap = typename InputTileIterator::ThreadMap;
/// Fragment object used to store the broadcast values
using BroadcastFragment = Array<
ElementFragment, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Used for the broadcast
struct BroadcastDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
static int const kThreadRows = kThreadCount / kThreadsPerRow;
// /// Number of iterations (accesses) the threadblock takes to reduce a row
// static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
};
// using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
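// SharedStorage is intentionally empty: visit() re-reads the broadcast scalar
// for its row directly from global memory, so nothing is staged through SMEM.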
/// Host-constructable Argument structure
struct Arguments {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Methods
CUTLASS_HOST_DEVICE
Arguments():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementVector *broadcast_ptr,
int64_t batch_stride
):
broadcast_ptr(broadcast_ptr),
batch_stride(batch_stride) { }
};
/// Param structure
struct Params {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
broadcast_ptr(args.broadcast_ptr),
batch_stride(args.batch_stride) { }
};
private:
ElementVector *broadcast_ptr;
BroadcastFragment broadcast_fragment; ///< Array holds the loaded broadcast fragment
MatrixCoord threadblock_offset_;
int thread_idx_;
MatrixCoord problem_size;
int thread_start_row_;
int state_[3];
int thread_offset_row_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpColumnBroadcast(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
broadcast_ptr(params.broadcast_ptr),
threadblock_offset_(threadblock_offset),
thread_idx_(thread_idx),
problem_size(problem_size),
thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
batch_stride_(params.batch_stride)
{
state_[0] = state_[1] = state_[2] = 0;
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
broadcast_ptr += batch_idx * batch_stride_;
}
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) {}
CUTLASS_DEVICE
void begin_row(int row_idx) {}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
// Locate the row this fragment maps to and broadcast its scalar across the fragment
thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();
ElementFragment broadcast_data = ElementFragment(*(broadcast_ptr + thread_offset_row_));
broadcast_fragment.fill(broadcast_data);
return broadcast_fragment;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) {
// Advance the per-thread row offset through the ThreadMap hierarchy:
// rows within a group, then groups, then clusters
++state_[0];
thread_start_row_ += ThreadMap::Shape::kRow;
if (state_[0] == ThreadMap::Count::kRow) {
state_[0] = 0;
++state_[1];
thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
if (state_[1] == ThreadMap::Count::kGroup) {
state_[1] = 0;
++state_[2];
thread_start_row_ += ThreadMap::Count::kGroup *
ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
if (state_[2] == ThreadMap::Count::kCluster) {
state_[2] = 0;
}
}
}
}
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,341 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with reduction over columns in a CTA
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementReductionAccumulator R[j] = \sum_i ElementReductionAccumulator(T[i][j])
/// device memory <- ElementReduction(R[j])
///
template <
typename ThreadblockShape_, /// Threadblock shape
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementReduction_, ///< Data type of the output reduction in device memory
typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register
typename OutputTileIterator_, ///< Tile Iterator type
typename Visitor_ ///< preceding visitor op
>
class VisitorOpColumnReduction {
public:
using ElementAccumulator = ElementAccumulator_;
using ElementReductionAccumulator = ElementReductionAccumulator_;
using ElementReduction = ElementReduction_;
using OutputTileIterator = OutputTileIterator_;
using ThreadblockShape = ThreadblockShape_;
using Visitor = Visitor_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
using ElementOutput = typename OutputTileIterator::Element;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of reduction
using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;
/// Thread map used by output tile iterators
using ThreadMap = typename OutputTileIterator::ThreadMap;
/// Used for the reduction
struct ReductionDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
static int const kThreadRows = kThreadCount / kThreadsPerRow;
/// Number of iterations (accesses) the threadblock takes to reduce a row
static int const kThreadAccessesPerRow = const_max(1, (ThreadblockShape::kN + kThreadCount - 1) / kThreadCount);
using StorageShape = MatrixShape<
kThreadRows,
ThreadblockShape::kN
>;
};
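// The reduction proceeds in two phases: visit() accumulates per-thread partial
// sums into the ReductionFragment registers, then end_epilogue() stages those
// partials through the shared-memory StorageShape tile and reduces across
// thread rows before writing converted results to global memory.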
using ReductionFragment = Array<ElementReductionAccumulator, ReductionDetail::kColumnsPerThread>;
/// Shared storage
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
AlignedArray<ElementReductionAccumulator, ReductionDetail::StorageShape::kCount, 16> reduction;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Argument structure
struct Arguments {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Method
CUTLASS_HOST_DEVICE
Arguments(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementReduction *reduction_ptr,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
reduction_ptr(reduction_ptr),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Params visitor_param; ///< Params of visitor
/// Method
CUTLASS_HOST_DEVICE
Params(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
reduction_ptr(args.reduction_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
ElementReduction *reduction_output_ptr_; ///< Pointer to the reduction tensor in device memory
ElementReductionAccumulator *reduction_smem_ptr_; ///< Pointer to the partial reductions in shared memory
ReductionFragment reduction_fragment; ///< register fragments that hold the partial reduction
Visitor visitor_; ///< visitor
int thread_idx_;
MatrixCoord threadblock_offset;
MatrixCoord problem_size_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpColumnReduction(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor,
thread_idx, threadblock_offset, problem_size),
reduction_smem_ptr_(shared_storage.reduction.data()),
reduction_output_ptr_(params.reduction_ptr),
thread_idx_(thread_idx),
threadblock_offset(threadblock_offset),
problem_size_(problem_size),
batch_stride_(params.batch_stride)
{ }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
reduction_output_ptr_ += batch_idx * batch_stride_;
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
// clear the reduction fragment
reduction_fragment.clear();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
NumericArrayConverter<ElementReductionAccumulator, ElementVisitor, kElementsPerAccess> reduction_converter;
ReductionOp reduction_op;
ReductionAccumulatorAccessType* reduction_fragment_ = reinterpret_cast<ReductionAccumulatorAccessType*>(&reduction_fragment);
reduction_fragment_[column_idx] = reduction_op(reduction_fragment_[column_idx], reduction_converter(result));
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
//
// Store the partially reduced value to SMEM
//
// Guard against uses of the existing SMEM tile
__syncthreads();
using AccessType = AlignedArray<ElementReductionAccumulator, ThreadMap::kElementsPerAccess>;
//
// Determine a compact thread arrangement to store to SMEM
//
MatrixCoord thread_offset(
thread_idx_ / ReductionDetail::kThreadsPerRow,
(thread_idx_ % ReductionDetail::kThreadsPerRow) * ThreadMap::kElementsPerAccess
);
//
// Each thread store its fragment to a SMEM
//
AccessType *aligned_reduction_ptr = reinterpret_cast<AccessType *>(
&reduction_smem_ptr_[thread_offset.row() * ThreadblockShape::kN + thread_offset.column()]
);
AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(
&reduction_fragment
);
CUTLASS_PRAGMA_UNROLL
for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
int col_idx = column * ThreadMap::Delta::kColumn / ThreadMap::kElementsPerAccess;
aligned_reduction_ptr[col_idx] = frag_ptr[column];
}
__syncthreads();
//
// Now, threads are assigned several columns of the output. They fetch all rows from
// the compacted SMEM tile and perform a reduction.
//
NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < ReductionDetail::kThreadAccessesPerRow; ++j) {
int column_idx = thread_idx_ + j * ReductionDetail::kThreadCount;
ReductionOpScalar reduction_op;
ElementReductionAccumulator reduction_element = ElementReductionAccumulator();
int output_column_idx = threadblock_offset.column() + column_idx;
if (column_idx < ThreadblockShape::kN && output_column_idx < problem_size_.column()) {
CUTLASS_PRAGMA_UNROLL
for (int row = 0; row < ReductionDetail::kThreadRows; ++row) {
if (row) {
auto frag = reduction_smem_ptr_[row * ThreadblockShape::kN + column_idx];
reduction_element = reduction_op(reduction_element, frag);
}
else {
reduction_element = reduction_smem_ptr_[column_idx];
}
}
// Store the partial reduction for this threadblock row
int output_offset = output_column_idx + threadblock_offset.row() / ThreadblockShape::kM * problem_size_.column();
reduction_output_ptr_[output_offset] = output_converter(reduction_element);
}
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,266 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with linear combination
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute alpha;
/// ElementCompute beta;
/// ElementCompute C = alpha * ElementCompute(Visitor_A) + beta * ElementCompute(Visitor_B);
/// return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename VisitorA_, ///< Child node A
typename VisitorB_ ///< Child node B
>
class VisitorOpLinearCombination{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using VisitorA = VisitorA_;
using VisitorB = VisitorB_;
/// Fragment type returned from VisitorA.visit
using VisitAccessTypeA = typename VisitorA::VisitAccessType;
using ElementA = typename VisitAccessTypeA::Element;
/// Fragment type returned from VisitorB.visit
using VisitAccessTypeB = typename VisitorB::VisitAccessType;
using ElementB = typename VisitAccessTypeB::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Combination Op
using CombinationOp = cutlass::plus<VisitAccessType>;
static_assert(kElementsPerAccess==VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
static_assert(kElementsPerAccess==VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename VisitorA::SharedStorage storage_a;
typename VisitorB::SharedStorage storage_b;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
ElementCompute alpha; ///< scales accumulators
ElementCompute beta; ///< scales source tensor
typename VisitorA::Arguments visitor_a_arg; ///< Argument type for visitor_a
typename VisitorB::Arguments visitor_b_arg; ///< Argument type for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():
alpha(ElementCompute(1)),
beta(ElementCompute(0))
{ }
CUTLASS_HOST_DEVICE
Arguments(
ElementCompute alpha,
ElementCompute beta,
typename VisitorA::Arguments visitor_a_arg,
typename VisitorB::Arguments visitor_b_arg
):
alpha(alpha),
beta(beta),
visitor_a_arg(visitor_a_arg),
visitor_b_arg(visitor_b_arg)
{ }
};
/// Parameter structure
struct Params {
ElementCompute alpha; ///< scales accumulators
ElementCompute beta; ///< scales source tensor
typename VisitorA::Params visitor_a_param; ///< Params for visitor_a
typename VisitorB::Params visitor_b_param; ///< Params for visitor_b
//
// Methods
//
CUTLASS_HOST_DEVICE
Params() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
alpha(args.alpha),
beta(args.beta),
visitor_a_param(args.visitor_a_arg),
visitor_b_param(args.visitor_b_arg)
{ }
};
private:
//
// Data members
//
ElementCompute alpha_;
ElementCompute beta_;
VisitorA visitor_a_op;
VisitorB visitor_b_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpLinearCombination(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
alpha_(params.alpha),
beta_(params.beta),
visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void begin_epilogue() {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_epilogue();
if (beta_ != ElementCompute(0)) visitor_b_op.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_step(step_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.begin_row(row_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor A and visitor B
VisitAccessTypeA result_A;
VisitAccessTypeB result_B;
if (alpha_ != ElementCompute(0)) {
result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
// Fill the result A with zeros
result_A.clear();
}
if (beta_ != ElementCompute(0)) {
result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
// Fill the result B with zeros
result_B.clear();
}
/// Type conversion
NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;
CombinationOp combination_op;
cutlass::multiplies<VisitAccessType> multiply_op;
return combination_op(
multiply_op(alpha_, source_converter_A(result_A)),
multiply_op(beta_, source_converter_B(result_B))
);
}
CUTLASS_DEVICE
void end_row(int row_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_row(row_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_step(step_idx);
if (beta_ != ElementCompute(0)) visitor_b_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
if (alpha_ != ElementCompute(0)) visitor_a_op.end_epilogue();
if (beta_ != ElementCompute(0)) visitor_b_op.end_epilogue();
}
};
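// Composition sketch for a classic linear combination (illustrative types):
//
//   using Accum   = VisitorOpAccumulator<float, 4>;
//   using Source  = VisitorOpColumnBroadcast<float, float, InputTileIterator>;
//   using LinComb = VisitorOpLinearCombination<float, float, 4, Accum, Source>;
//
// Each visit() returns alpha * accum + beta * source; a subtree whose scalar
// is zero is never evaluated.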
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,258 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op that broadcasts a vector to all rows
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementVector T[i][j] <- device-memory Td[j]
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementFragment_, ///< Data type used to cache vector in register
typename InputTileIterator_ ///< Tile iterator type to read the broadcasted tensor
>
class VisitorOpRowBroadcast {
public:
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementAccumulator = ElementAccumulator_;
using ElementVector = typename InputTileIterator::Element;
using ElementFragment = ElementFragment_;
using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;
/// Thread map used by input tile iterators
using ThreadMap = typename InputTileIterator::ThreadMap;
/// Fragment object used to store the broadcast values
using BroadcastFragment = Array<
ElementFragment,
ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Used for the broadcast
struct BroadcastDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
static int const kThreadRows = kThreadCount / kThreadsPerRow;
// /// Number of iterations (accesses) the threadblock takes to reduce a row
// static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
};
// using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Methods
CUTLASS_HOST_DEVICE
Arguments():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementVector *broadcast_ptr,
int64_t batch_stride
):
broadcast_ptr(broadcast_ptr),
batch_stride(batch_stride) { }
};
/// Param structure
struct Params {
ElementVector *broadcast_ptr; ///< Pointer to the additional tensor operand
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
broadcast_ptr(nullptr),
batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
broadcast_ptr(args.broadcast_ptr),
batch_stride(args.batch_stride) { }
};
private:
ElementVector *broadcast_ptr;
BroadcastFragment broadcast_fragment; ///< Array holds the loaded broadcast fragment
MatrixCoord threadblock_offset_;
int thread_idx_;
MatrixCoord problem_size;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpRowBroadcast(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
broadcast_ptr(params.broadcast_ptr + threadblock_offset.column()),
threadblock_offset_(threadblock_offset),
thread_idx_(thread_idx),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
broadcast_ptr += batch_idx * batch_stride_;
}
CUTLASS_DEVICE
void begin_epilogue() {
// load broadcast fragment
load_broadcast_fragment_();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {}
CUTLASS_DEVICE
void begin_row(int row_idx) {}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
VisitAccessType* broadcast_fragment_ = reinterpret_cast<VisitAccessType*>(&broadcast_fragment);
return broadcast_fragment_[column_idx];
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
private:
CUTLASS_DEVICE
void load_broadcast_fragment_() {
broadcast_fragment.clear();
// If no pointer is supplied, leave the fragment zeroed and avoid memory accesses
if (!broadcast_ptr) {
return;
}
int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();
int thread_column_idx = threadblock_offset_.column() + thread_initial_column;
broadcast_ptr += thread_initial_column;
NumericArrayConverter<ElementFragment, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
using AccessFragmentType = Array<ElementFragment, BroadcastDetail::kElementsPerAccess>;
AccessFragmentType *frag_ptr = reinterpret_cast<AccessFragmentType *>(&broadcast_fragment);
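// Each loop iteration issues one aligned vector load, predicated on the
// column index being in bounds, then converts it to the fragment element type.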
CUTLASS_PRAGMA_UNROLL
for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {
AccessType loaded;
loaded.clear();
if (thread_column_idx < problem_size.column()) {
loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
}
AccessFragmentType cvt = converter(loaded);
frag_ptr[j] = cvt;
thread_column_idx += ThreadMap::Delta::kColumn;
broadcast_ptr += ThreadMap::Delta::kColumn;
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,319 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with reduction over rows in a CTA
*/
#pragma once
#include "cutlass/cutlass.h"
#include "stdio.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementReductionAccumulator R[i] = \sum_j ElementReductionAccumulator(T[i][j])
/// device memory <- ElementReduction(R[i])
///
template <
typename ThreadblockShape_, /// Threadblock shape
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementReduction_, ///< Data type of the output reduction in device memory
typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register
typename OutputTileIterator_, ///< Tile Iterator type
typename Visitor_ ///< preceding visitor op
>
class VisitorOpRowReduction {
public:
using ElementAccumulator = ElementAccumulator_;
using ElementReductionAccumulator = ElementReductionAccumulator_;
using ElementReduction = ElementReduction_;
using OutputTileIterator = OutputTileIterator_;
using ThreadblockShape = ThreadblockShape_;
using Visitor = Visitor_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
using ElementOutput = typename OutputTileIterator::Element;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of reduction
using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;
/// Thread map used by output tile iterators
using ThreadMap = typename OutputTileIterator::ThreadMap;
/// Used for the reduction
struct ReductionDetail {
/// Number of threads per warp
static int const kWarpSize = 32;
/// Number of distinct scalar column indices handled by each thread
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
/// Number of distinct scalar row indices handled by each thread
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
/// Number of threads per threadblock
static int const kThreadCount = ThreadMap::kThreads;
/// Number of distinct threads per row of output tile
static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;
/// Half number of threads per row used for cross-thread reduction
static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
static int const kThreadRows = kThreadCount / kThreadsPerRow;
};
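// Unlike the column reduction, the row reduction keeps no SMEM tile: threads
// covering the same row exchange partial sums via warp shuffles in visit(),
// which assumes those threads map into a single warp.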
/// Shared storage
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Method
CUTLASS_HOST_DEVICE
Arguments(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementReduction *reduction_ptr,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
reduction_ptr(reduction_ptr),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
ElementReduction *reduction_ptr; ///< Pointer to the reduction tensor in device memory
int64_t batch_stride;
typename Visitor::Params visitor_param; ///< Params of visitor
/// Method
CUTLASS_HOST_DEVICE
Params(): reduction_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
reduction_ptr(args.reduction_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
ElementReduction *reduction_output_ptr_; ///< Pointer to the reduction tensor in device memory
ElementReductionAccumulator reduction_accum;
Visitor visitor_; ///< visitor
int thread_idx_;
MatrixCoord threadblock_offset;
MatrixCoord problem_size_;
int thread_start_row_; ///< Starting row index of this thread within the output tile
int state_[3]; ///< Tracks the row iterator state across steps
int thread_offset_row_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpRowReduction(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor,
thread_idx, threadblock_offset, problem_size),
reduction_output_ptr_(params.reduction_ptr),
thread_idx_(thread_idx),
threadblock_offset(threadblock_offset),
problem_size_(problem_size),
thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
batch_stride_(params.batch_stride)
{
state_[0] = state_[1] = state_[2] = 0;
}
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
reduction_output_ptr_ += batch_idx * batch_stride_;
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
reduction_accum = ElementReductionAccumulator(0);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();
ReductionOpScalar reduction_op;
ElementReductionAccumulator reduction_accum_ = reduction(result);
// After performing the in-thread reduction, we then perform cross-thread / in-warp reduction
CUTLASS_PRAGMA_UNROLL
for (int i = ReductionDetail::kHalfThreadsPerRow; i > 0; i >>= 1) {
reduction_accum_ = reduction_op(reduction_accum_, __shfl_xor_sync(0xFFFFFFFF, reduction_accum_, i));
}
reduction_accum = reduction_op(reduction_accum, reduction_accum_);
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;
bool is_write_thread = (thread_offset_row_ < problem_size_.row() && (thread_idx_ % ReductionDetail::kThreadsPerRow) == 0);
int row_offset = thread_offset_row_ + threadblock_offset.column() / ThreadblockShape::kN * problem_size_.row();
ElementReduction *curr_ptr_reduction = reduction_output_ptr_ + row_offset;
arch::global_store<ElementReduction, sizeof(ElementReduction)>(
output_converter(reduction_accum),
(void *)curr_ptr_reduction,
is_write_thread);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
// Advance the thread's starting row, mirroring the output tile iterator's operator++
++state_[0];
thread_start_row_ += ThreadMap::Shape::kRow;
if (state_[0] == ThreadMap::Count::kRow) {
state_[0] = 0;
++state_[1];
thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
ThreadMap::Shape::kRow * ThreadMap::Count::kRow;
if (state_[1] == ThreadMap::Count::kGroup) {
state_[1] = 0;
++state_[2];
thread_start_row_ += ThreadMap::Count::kGroup *
ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;
if (state_[2] == ThreadMap::Count::kCluster) {
state_[2] = 0;
}
}
}
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
}
private:
CUTLASS_DEVICE
ElementReductionAccumulator reduction(VisitAccessTypeVisitor const& result) {
ElementReductionAccumulator sum_ = ElementReductionAccumulator(0);
ReductionOpScalar reduction_op;
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < VisitAccessTypeVisitor::kElements; ++i) {
sum_ = reduction_op(sum_, result[i]);
}
return sum_;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
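The cross-thread step in VisitorOpRowReduction::visit() above is the classic XOR-shuffle butterfly reduction: each exchange halves the number of distinct partial sums, so after log2(N) steps every participating lane holds the group total, and end_row() can then let a single lane per row perform the predicated global store. A minimal standalone CUDA sketch of the same pattern follows; it reduces across a full warp rather than kHalfThreadsPerRow pairs, and the kernel and buffer names are illustrative, not CUTLASS symbols.

#include <cuda_runtime.h>
#include <cstdio>

// Butterfly reduction across one warp: after 5 XOR-shuffle steps every lane
// holds the warp-wide sum; only lane 0 writes, as in end_row() above.
__global__ void warp_sum(float const *in, float *out) {
  float v = in[threadIdx.x];
  for (int i = 16; i > 0; i >>= 1) {
    v += __shfl_xor_sync(0xFFFFFFFF, v, i);
  }
  if (threadIdx.x == 0) {
    *out = v;
  }
}

int main() {
  float h_in[32], h_out, *d_in, *d_out;
  for (int i = 0; i < 32; ++i) h_in[i] = float(i);   // expected sum: 496
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  warp_sum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%f\n", h_out);                        // 496.000000
  return 0;
}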

View File

@ -1,188 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with Tensor Input
*/
#pragma once
#include "cutlass/cutlass.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementInput C <- device memory
///
/// It can only be a leaf node in the epilogue tree
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename InputTileIterator_ ///< Tile iterator type to read the tensor
>
class VisitorOpTensorInput {
public:
using ElementAccumulator = ElementAccumulator_;
using InputTileIterator = InputTileIterator_;
static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
using ElementInput = typename InputTileIterator::Element;
using VisitAccessType = Array<ElementInput, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
struct SharedStorage {
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementInput *input_ptr; ///< Pointer to the input tensor in device memory
int ldt; ///< Leading dimension of the input tensor operand
int64_t batch_stride; ///< batch stride for batched GEMM
/// Methods
CUTLASS_HOST_DEVICE
Arguments(): input_ptr(nullptr), ldt(0), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementInput *input_ptr,
int ldt, int64_t batch_stride
):
input_ptr(input_ptr),
ldt(ldt),
batch_stride(batch_stride)
{ }
};
/// Param structure
struct Params {
typename InputTileIterator::Params params_input;
ElementInput *input_ptr;
int64_t batch_stride;
/// Method
CUTLASS_HOST_DEVICE
Params():
input_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
params_input(args.ldt),
input_ptr(args.input_ptr),
batch_stride(args.batch_stride)
{ }
};
private:
InputTileIterator iterator_T_;
typename InputTileIterator::Fragment fragment_T_;
MatrixCoord problem_size;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpTensorInput(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
iterator_T_(
InputTileIterator(
params.params_input,
params.input_ptr,
problem_size,
thread_idx,
threadblock_offset
)
),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
}
CUTLASS_DEVICE
void begin_epilogue() { }
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_T_.clear();
iterator_T_.load(fragment_T_);
++iterator_T_;
}
CUTLASS_DEVICE
void begin_row(int row_idx) { }
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
VisitAccessType source = reinterpret_cast<VisitAccessType *>(&fragment_T_)[frag_idx];
return source;
}
CUTLASS_DEVICE
void end_row(int row_idx) { }
CUTLASS_DEVICE
void end_step(int step_idx) { }
CUTLASS_DEVICE
void end_epilogue() { }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,240 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with Tensor Output
*/
#pragma once
#include "cutlass/cutlass.h"
#include "stdio.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementOutput T = ElementOutput(Visitor)
/// T -> device memory
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename OutputTileIterator_, ///< Tile iterator type to write the tensor
typename Visitor_ ///< Child visitor that produces the output tensor
>
class VisitorOpTensorOutput {
public:
using ElementAccumulator = ElementAccumulator_;
using OutputTileIterator = OutputTileIterator_;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
using ElementOutput = typename OutputTileIterator::Element;
using Visitor = Visitor_;
/// Fragment type returned from Visitor
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
using VisitAccessType = VisitAccessTypeVisitor;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Fragment type of output
using OutputAccessType = Array<ElementOutput, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() { }
};
/// Host-constructable Argument structure
struct Arguments {
ElementOutput *output_ptr; ///< Pointer to the output tensor in device memory
int ldt; ///< Leading dimension of the output tensor operand
int64_t batch_stride; ///< batch stride
typename Visitor::Arguments visitor_arg; ///< Argument type of visitor
/// Methods
CUTLASS_HOST_DEVICE
Arguments(): output_ptr(nullptr), ldt(0), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Arguments(
ElementOutput *output_ptr,
int ldt,
int64_t batch_stride,
typename Visitor::Arguments visitor_arg
):
output_ptr(output_ptr),
ldt(ldt),
batch_stride(batch_stride),
visitor_arg(visitor_arg)
{ }
};
/// Param structure
struct Params {
typename OutputTileIterator::Params params_output;
ElementOutput *output_ptr;
int64_t batch_stride;
typename Visitor::Params visitor_param;
/// Method
CUTLASS_HOST_DEVICE
Params():
output_ptr(nullptr), batch_stride(0) { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
params_output(args.ldt),
output_ptr(args.output_ptr),
batch_stride(args.batch_stride),
visitor_param(args.visitor_arg)
{ }
};
private:
OutputTileIterator iterator_T_;
typename OutputTileIterator::Fragment fragment_T_;
MatrixCoord problem_size;
Visitor visitor_;
int64_t batch_stride_;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpTensorOutput(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
visitor_(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size),
iterator_T_(
OutputTileIterator(
params.params_output,
params.output_ptr,
problem_size,
thread_idx,
threadblock_offset
)
),
problem_size(problem_size),
batch_stride_(params.batch_stride) { }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
visitor_.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
visitor_.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_T_.clear();
visitor_.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
visitor_.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from visitor
VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
// Column guard
MatrixCoord thread_offset_ = iterator_T_.thread_start() + OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
bool column_guard = (thread_offset_.column() < problem_size.column());
if (column_guard) {
NumericArrayConverter<ElementOutput, ElementVisitor, kElementsPerAccess> output_converter;
OutputAccessType &output = reinterpret_cast<OutputAccessType *>(&fragment_T_)[frag_idx];
output = output_converter(result);
}
return result;
}
CUTLASS_DEVICE
void end_row(int row_idx) {
visitor_.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
visitor_.end_step(step_idx);
iterator_T_.store(fragment_T_);
++iterator_T_;
}
CUTLASS_DEVICE
void end_epilogue() {
visitor_.end_epilogue();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
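VisitorOpTensorInput above is a leaf of the epilogue visitor tree, while VisitorOpTensorOutput is an interior node that forwards every begin/visit/end hook to its child and post-processes the fragment the child returns. The following is a much-simplified, CUTLASS-free C++ sketch of that protocol, with illustrative names and std::array standing in for cutlass::Array; it shows only the shape of the pattern, not the real iterator machinery.

#include <array>
#include <cstdio>

struct LeafAccum {                        // leaf node: returns the accumulator fragment
  using Fragment = std::array<float, 4>;
  void begin_step(int) { }
  Fragment visit(Fragment const &accum) { return accum; }
  void end_step(int) { }
};

template <typename Child>
struct ScaleNode {                        // interior node: transforms the child's result
  using Fragment = typename Child::Fragment;
  Child child;
  float alpha;
  void begin_step(int step_idx) { child.begin_step(step_idx); }
  Fragment visit(Fragment const &accum) {
    Fragment result = child.visit(accum); // evaluate the subtree first
    for (auto &x : result) x *= alpha;    // then apply this node's transform
    return result;
  }
  void end_step(int step_idx) { child.end_step(step_idx); }
};

int main() {
  ScaleNode<LeafAccum> tree{LeafAccum{}, 2.0f};
  tree.begin_step(0);
  LeafAccum::Fragment out = tree.visit({1.0f, 2.0f, 3.0f, 4.0f});
  tree.end_step(0);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 4 6 8
  return 0;
}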

View File

@ -1,226 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief This file contains the epilogue visitor Op with a Unary operation
*/
#pragma once
#include "cutlass/cutlass.h"
#include "unary_ops.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
/// ElementCompute C = UnaryOp(ElementCompute(Visitor))
/// Return C;
///
template <
typename ElementAccumulator_, ///< Data type of the Accumulator
typename ElementCompute_, ///< Data type used to compute linear combination
int kElementsPerAccess_, ///< Number of elements computed per operation
typename Visitor_, ///< Child node
template<typename T, int N> typename UnaryOp_
>
class VisitorOpUnary{
public:
using ElementAccumulator = ElementAccumulator_;
using ElementCompute = ElementCompute_;
static int const kElementsPerAccess = kElementsPerAccess_;
using Visitor = Visitor_;
/// Fragment type returned from Visitor.visit
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
using ElementVisit = typename VisitAccessTypeVisitor::Element;
/// Fragment type returned by this visitor
using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;
/// Fragment type of accumulator
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
/// Combination Op
using UnaryOp = UnaryOp_<ElementCompute, kElementsPerAccess>;
static_assert(kElementsPerAccess==VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");
/// SMEM buffer class required in the epilogue visitor
struct SharedStorage {
typename Visitor::SharedStorage storage_visitor;
CUTLASS_HOST_DEVICE
SharedStorage() {}
};
/// Host-constructable Arguments structure
struct Arguments {
typename UnaryOp::Arguments unary_arg;
typename Visitor::Arguments visitor_arg; ///< Argument type for visitor
//
// Methods
//
CUTLASS_HOST_DEVICE
Arguments():unary_arg() { }
CUTLASS_HOST_DEVICE
Arguments(
typename UnaryOp::Arguments unary_arg,
typename Visitor::Arguments visitor_arg
):
unary_arg(unary_arg),
visitor_arg(visitor_arg)
{ }
};
/// Parameter structure
struct Params {
typename UnaryOp::Params unary_param;
typename Visitor::Params visitor_param; ///< Argument type for visitor
//
// Methods
//
CUTLASS_HOST_DEVICE
Params():unary_param() { }
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
unary_param(args.unary_arg),
visitor_param(args.visitor_arg)
{ }
};
private:
//
// Data members
//
UnaryOp unary_op;
Visitor visitor_op;
public:
/// Constructs the function object
CUTLASS_HOST_DEVICE
VisitorOpUnary(
Params const &params,
SharedStorage &shared_storage,
int thread_idx,
MatrixCoord threadblock_offset,
MatrixCoord problem_size
):
unary_op(params.unary_param),
visitor_op(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size)
{ }
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
visitor_op.set_batch_index(batch_idx);
}
CUTLASS_DEVICE
void begin_epilogue() {
if (unary_op.guard()) visitor_op.begin_epilogue();
}
CUTLASS_DEVICE
void begin_step(int step_idx) {
if (unary_op.guard()) visitor_op.begin_step(step_idx);
}
CUTLASS_DEVICE
void begin_row(int row_idx) {
if (unary_op.guard()) visitor_op.begin_row(row_idx);
}
CUTLASS_DEVICE
VisitAccessType visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorAccessType const &accum
) {
/// Get result from the child visitor
VisitAccessTypeVisitor result;
if (unary_op.guard()) {
result = visitor_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
} else {
result.clear();
}
/// Convert to the compute type and apply the unary op
NumericArrayConverter<ElementCompute, ElementVisit, kElementsPerAccess> source_converter;
return unary_op(source_converter(result));
}
CUTLASS_DEVICE
void end_row(int row_idx) {
if (unary_op.guard()) visitor_op.end_row(row_idx);
}
CUTLASS_DEVICE
void end_step(int step_idx) {
if (unary_op.guard()) visitor_op.end_step(step_idx);
}
CUTLASS_DEVICE
void end_epilogue() {
if (unary_op.guard()) visitor_op.end_epilogue();
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
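VisitorOpUnary imposes an implicit interface on its UnaryOp_ template template parameter: nested Arguments and Params types, a guard() predicate that can short-circuit the whole subtree, and a call operator applied to a fragment. A hedged sketch of a conforming op, inferred from the usage above, with std::array standing in for cutlass::Array:

#include <array>

template <typename T, int N>
struct ScaleOp {
  struct Arguments {
    T alpha = T(1);
  };
  struct Params {
    T alpha;
    Params(): alpha(T(1)) { }
    Params(Arguments const &args): alpha(args.alpha) { }
  };
  T alpha;
  ScaleOp(Params const &params): alpha(params.alpha) { }
  // When guard() returns false, VisitorOpUnary skips the child subtree entirely.
  bool guard() const { return alpha != T(0); }
  std::array<T, N> operator()(std::array<T, N> const &frag) const {
    std::array<T, N> result;
    for (int i = 0; i < N; ++i) {
      result[i] = alpha * frag[i];
    }
    return result;
  }
};

int main() {
  ScaleOp<float, 4>::Params params(ScaleOp<float, 4>::Arguments{2.0f});
  ScaleOp<float, 4> op(params);
  std::array<float, 4> out = op({1.0f, 2.0f, 3.0f, 4.0f});
  return out[3] == 8.0f ? 0 : 1;
}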

View File

@ -1,480 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Epilogue visitor type used for partial computation of a layernorm operation
GemmLayernorm example = GEMM0 with partial reduction fused in epilogue (EpilogueVisitorLayerNorm)
+ lightweight full reduction kernel (ApplyFinalReduction)
+ GEMM1 with elementwise operations fused in mainloop (GemmLayernormMainloopFusion)
*/
#pragma once
/////////////////////////////////////////////////////////////////////////////////////////////////
#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace epilogue {
namespace threadblock {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename ThreadblockShape_,
int ThreadCount,
typename OutputTileIterator_,
typename AccumulatorTile_,
typename ElementAccumulator_,
typename ElementVariance_,
typename ElementMean_,
typename ElementLayernormCompute_,
typename ElementwiseFunctor_,
bool IsShiftedVariance_ = false
>
class EpilogueVisitorLayerNorm {
public:
using ElementVariance = ElementVariance_;
using ElementMean = ElementMean_;
using ElementLayernormCompute = ElementLayernormCompute_;
using AccumulatorTile = AccumulatorTile_;
using ThreadblockShape = ThreadblockShape_;
static int const kThreadCount = ThreadCount;
using OutputTileIterator = OutputTileIterator_;
using ElementwiseFunctor = ElementwiseFunctor_;
static int const kIterations = OutputTileIterator::kIterations;
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
static int const kRowIterations = OutputTileIterator::ThreadMap::Iterations::kRow;
static int const kThreads = OutputTileIterator::ThreadMap::kThreads;
static bool const kIsShiftedVariance = IsShiftedVariance_;
using ElementOutput = typename OutputTileIterator::Element;
static int const kDeltaRow = OutputTileIterator::ThreadMap::Delta::kRow;
/// Array type used in Shift-K Layernorm
static int const kRowAccessCount = kIterations * kRowIterations;
using ConvertedShiftFragment = Array<ElementLayernormCompute, kRowAccessCount>;
// Column-major output is handled by transposing the problem externally (already supported)
using LayoutOutput = cutlass::layout::RowMajor;
using ElementAccumulator = ElementAccumulator_;
using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
using LayernormFragment = Array<ElementLayernormCompute, kElementsPerAccess>;
using OutputVector = Array<ElementOutput, kElementsPerAccess>;
using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;
static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
static int const kThreadsInColumn = kThreads / kThreadsPerRow;
static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);
/// Argument structure
struct Arguments {
typename ElementwiseFunctor::Params elementwise;
ElementVariance *ptr_Variance;
ElementMean *ptr_Mean;
ElementOutput *ptr_Shifted_K;
MatrixCoord extent;
//
// Methods
//
Arguments():
ptr_Variance(nullptr),
ptr_Mean(nullptr),
ptr_Shifted_K(nullptr)
{
}
Arguments(
typename ElementwiseFunctor::Params elementwise_,
ElementVariance *ptr_Variance_,
ElementMean *ptr_Mean_,
ElementOutput *ptr_Shifted_K_ = nullptr,
MatrixCoord extent_ = MatrixCoord(0, 0)
):
elementwise(elementwise_),
ptr_Variance(ptr_Variance_),
ptr_Mean(ptr_Mean_),
ptr_Shifted_K(ptr_Shifted_K_),
extent(extent_)
{
}
};
struct Params {
typename ElementwiseFunctor::Params elementwise;
ElementVariance *ptr_Variance;
ElementMean *ptr_Mean;
ElementOutput *ptr_Shifted_K;
MatrixCoord extent;
//
// Methods
//
CUTLASS_HOST_DEVICE
Params():
ptr_Variance(nullptr),
ptr_Mean(nullptr),
ptr_Shifted_K(nullptr)
{
}
CUTLASS_HOST_DEVICE
Params(Arguments const &args):
elementwise(args.elementwise),
ptr_Variance(args.ptr_Variance),
ptr_Mean(args.ptr_Mean),
ptr_Shifted_K(args.ptr_Shifted_K),
extent(args.extent)
{
}
};
/// Shared storage
struct SharedStorage {
};
private:
Params const & params_;
SharedStorage & shared_storage_;
MatrixCoord extent_;
ElementwiseFunctor elementwise_;
OutputTileIterator iterator_C_;
OutputTileIterator iterator_D_;
typename OutputTileIterator::Fragment fragment_C_;
typename OutputTileIterator::Fragment fragment_D_;
ElementAccumulator alpha_;
ElementAccumulator beta_;
ConvertedShiftFragment shift_k_frag_;
ElementLayernormCompute accum_sum_square_;
ElementLayernormCompute accum_sum_element_;
int thread_idx_;
MatrixCoord thread_offset_;
gemm::GemmCoord threadblock_tile_offset_;
public:
CUTLASS_DEVICE
EpilogueVisitorLayerNorm(
Params const &params, ///< Parameters routed to the epilogue
SharedStorage &shared_storage, ///< Shared storage needed by the functors here
MatrixCoord threadblock_offset,
gemm::GemmCoord threadblock_tile_offset,
int thread_idx,
OutputTileIterator destination_iterator, ///< Tile iterator for destination
OutputTileIterator source_iterator ///< Tile iterator for the source tensor
):
params_(params),
shared_storage_(shared_storage),
elementwise_(params.elementwise),
extent_(params.extent),
iterator_C_(source_iterator),
iterator_D_(destination_iterator),
threadblock_tile_offset_(threadblock_tile_offset),
thread_idx_(thread_idx)
{
alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
beta_ = (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);
if (beta_ == ElementAccumulator()) {
iterator_C_.clear_mask();
}
}
/// Helper to indicate split-K behavior
CUTLASS_DEVICE
void set_k_partition(
int split_k_index, ///< Index of this threadblock within split-K partitioned scheme
int split_k_slices) { ///< Total number of split-K slices
}
/// Called to set the batch index
CUTLASS_DEVICE
void set_batch_index(int batch_idx) {
}
/// Called at the start of the epilogue just before iterating over accumulator slices
CUTLASS_DEVICE
void begin_epilogue() {
// If shift-K feature is enabled, we load shift-k fragment
// at the very beginning of an epilogue
if (kIsShiftedVariance && params_.ptr_Shifted_K != nullptr) {
shift_k_frag_.clear();
int thread_offset_row_base = iterator_D_.thread_start_row();
CUTLASS_PRAGMA_UNROLL
for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) {
int step_offset = iter_idx * OutputTileIterator::Shape::kRow;
CUTLASS_PRAGMA_UNROLL
for (int rid = 0; rid < kRowIterations; ++rid) {
int row_step_offset = rid * kDeltaRow;
int row_offset = thread_offset_row_base + step_offset + row_step_offset;
bool is_load = (row_offset < extent_.row());
shift_k_frag_[iter_idx * kRowIterations + rid] = load_shift_k_(row_offset, is_load);
}
}
}
}
/// Called at the start of one step before starting accumulator exchange
CUTLASS_DEVICE
void begin_step(int step_idx) {
fragment_D_.clear();
if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
fragment_C_.clear();
iterator_C_.load(fragment_C_);
++iterator_C_;
}
}
/// Called at the start of a row
CUTLASS_DEVICE
void begin_row(int row_idx) {
/// set the accumulator to 0
accum_sum_element_ = ElementLayernormCompute(0);
accum_sum_square_ = ElementLayernormCompute(0);
}
/// Called after accumulators have been exchanged for each accumulator vector
CUTLASS_DEVICE
void visit(
int iter_idx,
int row_idx,
int column_idx,
int frag_idx,
AccumulatorFragment const &accum) {
using Mul = cutlass::multiplies<ElementLayernormCompute>;
using Minus = cutlass::minus<ElementLayernormCompute>;
using Exp = cutlass::fast_exp_op<ElementLayernormCompute>;
Minus minus;
Mul mul;
Exp exponential;
LayernormFragment result;
thread_offset_ =
iterator_D_.thread_start() +
OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
NumericArrayConverter<ElementLayernormCompute, ElementOutput, kElementsPerAccess> source_converter;
OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];
bool column_guard = (thread_offset_.column() < extent_.column());
if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
result = source_converter(elementwise_(accum));
} else {
result = source_converter(elementwise_(accum, source_vector));
}
ElementLayernormCompute inv_scalar = cutlass::constants::one<ElementLayernormCompute>() / ElementLayernormCompute(extent_.column());
// Fragment is cleared for non-reachable columns so no need to check against column guard
ElementLayernormCompute accum_sum_element_tmp = element_sum_accumulator_(result);
// The square sum is different: fragments in non-reachable columns are zero, but with
// shift-K enabled each such zero would contribute (0 - k)^2 = k^2 to the square sum,
// so the column guard must exclude them.
ElementLayernormCompute accum_sum_square_tmp = ElementLayernormCompute(0);
if (column_guard) {
accum_sum_square_tmp = (kIsShiftedVariance) ? \
square_sum_accumulator_(result, shift_k_frag_[iter_idx * kRowIterations + row_idx]) : \
square_sum_accumulator_(result);
}
accum_sum_element_tmp *= inv_scalar;
accum_sum_square_tmp *= inv_scalar;
// After performing the in-thread reduction, we then perform cross-thread / in-warp reduction
CUTLASS_PRAGMA_UNROLL
for (int i = kHalfThreadsPerRow; i > 0; i >>= 1) {
accum_sum_element_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_element_tmp, i);
accum_sum_square_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_square_tmp, i);
}
accum_sum_element_ += accum_sum_element_tmp;
accum_sum_square_ += accum_sum_square_tmp;
// Convert to the output
NumericArrayConverter<ElementOutput, ElementLayernormCompute, kElementsPerAccess> output_converter;
OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
output = output_converter(result);
}
/// Called at the end of a row
CUTLASS_DEVICE
void end_row(int row_idx) {
using ConvertVarianceOutput = cutlass::NumericConverter<ElementVariance, ElementLayernormCompute>;
using ConvertMeanOutput = cutlass::NumericConverter<ElementMean, ElementLayernormCompute>;
ConvertVarianceOutput convert_variance_output;
ConvertMeanOutput convert_mean_output;
bool is_write_thread = (thread_offset_.row() < extent_.row() && (threadIdx.x % kThreadsPerRow) == 0);
int row_offset = thread_offset_.row() + threadblock_tile_offset_.n() * extent_.row();
ElementVariance *curr_ptr_sum_square = params_.ptr_Variance + row_offset;
ElementMean *curr_ptr_element_sum = params_.ptr_Mean + row_offset;
arch::global_store<ElementVariance, sizeof(ElementVariance)>(
convert_variance_output(accum_sum_square_),
(void *)curr_ptr_sum_square,
is_write_thread);
arch::global_store<ElementMean, sizeof(ElementMean)>(
convert_mean_output(accum_sum_element_),
(void *)curr_ptr_element_sum,
is_write_thread);
}
/// Called after all accumulator elements have been visited
CUTLASS_DEVICE
void end_step(int step_idx) {
iterator_D_.store(fragment_D_);
++iterator_D_;
}
/// Called after all steps have been completed
CUTLASS_DEVICE
void end_epilogue() {
}
private:
CUTLASS_DEVICE
ElementLayernormCompute load_shift_k_(int row_offset, bool is_load) {
using ConvertShiftK = cutlass::NumericConverter<ElementLayernormCompute, ElementOutput>;
ConvertShiftK convert_shift_k;
ElementOutput shift_k_val;
// Computes the address to load shift_k element
ElementOutput *curr_ptr_shift_k = params_.ptr_Shifted_K + row_offset;
// Conditionally loads from global memory
arch::global_load<ElementOutput, sizeof(ElementOutput)>(shift_k_val, (void *)curr_ptr_shift_k, is_load);
// Converts data type to return
ElementLayernormCompute converted_shift_k_val = convert_shift_k(shift_k_val);
return converted_shift_k_val;
}
CUTLASS_DEVICE
ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
auto accum_ = accum[i];
sum_ += accum_ * accum_;
}
return sum_;
}
CUTLASS_DEVICE
ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum, ElementLayernormCompute shift_k_val) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
auto accum_ = accum[i] - shift_k_val;
sum_ += accum_ * accum_;
}
return sum_;
}
CUTLASS_DEVICE
ElementLayernormCompute element_sum_accumulator_(LayernormFragment const &accum) {
ElementLayernormCompute sum_ = ElementLayernormCompute(0);
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < LayernormFragment::kElements; ++i) {
sum_ += accum[i];
}
return sum_;
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace threadblock
} // namespace epilogue
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
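The shift-K path above is a numerical-stability device: for any constant k, Var(x) = E[(x - k)^2] - (E[x] - k)^2, and choosing k close to the data keeps both terms small enough to avoid the catastrophic cancellation of the naive E[x^2] - E[x]^2 form when the mean is large relative to the variance. A small host-side C++ reference with made-up data demonstrates the identity:

#include <cstdio>
#include <vector>

// Host reference for the shift-K trick: k is any constant near the data
// (here the first element, as loaded by load_shift_k_() above).
int main() {
  std::vector<float> x = {1000.0f, 1001.0f, 1002.0f, 1003.0f};
  float k = x[0];
  double sum = 0.0, sum_sq_shifted = 0.0;
  for (float v : x) {
    sum += v;
    double d = v - k;
    sum_sq_shifted += d * d;   // accumulates (x - k)^2, as in square_sum_accumulator_
  }
  double n = double(x.size());
  double mean = sum / n;
  double var = sum_sq_shifted / n - (mean - k) * (mean - k);
  std::printf("mean=%f var=%f\n", mean, var);   // mean=1001.500000 var=1.250000
  return 0;
}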

View File

@ -1,77 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Binds GEMM-related enum types to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/gemm.h"
#include "host.h"
namespace py = pybind11;
void bind_gemm(py::module &m) {
//
// Enumerate types
// cutlass/gemm/gemm.h
py::enum_<cutlass::gemm::GemmUniversalMode>(m, "Mode")
.value("Gemm", cutlass::gemm::GemmUniversalMode::kGemm, "Ordinary GEMM & GEMM Split-K serial")
.value("GemmSplitKParallel", cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, "GEMM Split-K parallel")
.value("Batched", cutlass::gemm::GemmUniversalMode::kBatched, "Batched GEMM")
.value("Array", cutlass::gemm::GemmUniversalMode::kArray)
.value("Invalid", cutlass::gemm::GemmUniversalMode::kInvalid);
/// GemmCoord is a structure that specifies a location within the coordinate space of a GEMM problem
py::class_<cutlass::gemm::GemmCoord>(m, "GemmCoord")
.def(py::init<int, int, int>())
.def("m", py::overload_cast<>(&cutlass::gemm::GemmCoord::m))
.def("n", py::overload_cast<>(&cutlass::gemm::GemmCoord::n))
.def("k", py::overload_cast<>(&cutlass::gemm::GemmCoord::k))
// get tensor coords
.def("mk",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.mk());
})
.def("kn",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.kn());
})
.def("mn",
[](const cutlass::gemm::GemmCoord & problem_size) {
return cutlass::MatrixCoord(problem_size.mn());
});
py::module_ host_submodule = m.def_submodule("host");
bind_gemm_host_helper(host_submodule);
}
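For context, bind_gemm() above would typically be registered into a single pybind11 extension module. A hypothetical top-level entry point might look like the following; the module name is illustrative and not taken from the source tree.

#include <pybind11/pybind11.h>

namespace py = pybind11;

void bind_gemm(py::module &m);  // defined above

// Hypothetical extension-module entry point collecting the GEMM bindings.
PYBIND11_MODULE(example_cutlass_bindings, m) {
  m.doc() = "Example registration of the GEMM enum/coordinate bindings";
  bind_gemm(m);
}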

View File

@ -1,628 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/params_universal_base.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/complex.h"
#include "cutlass/semaphore.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/trace.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate
typename Epilogue_, ///! Epilogue
typename ThreadblockSwizzle_ ///! Threadblock swizzling function
>
struct GemmUniversalwithEpilogueVisitor {
public:
using Mma = Mma_;
using Epilogue = Epilogue_;
using EpilogueVisitor = typename Epilogue::Visitor;
using ThreadblockSwizzle = ThreadblockSwizzle_;
using ElementA = typename Mma::IteratorA::Element;
using LayoutA = typename Mma::IteratorA::Layout;
using ElementB = typename Mma::IteratorB::Element;
using LayoutB = typename Mma::IteratorB::Layout;
using ElementC = typename EpilogueVisitor::ElementOutput;
using LayoutC = typename EpilogueVisitor::OutputTileIterator::Layout;
static ComplexTransform const kTransformA = Mma::kTransformA;
static ComplexTransform const kTransformB = Mma::kTransformB;
using Operator = typename Mma::Operator;
using OperatorClass = typename Mma::Operator::OperatorClass;
using ThreadblockShape = typename Mma::Shape;
using WarpShape = typename Mma::Operator::Shape;
using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
using ArchTag = typename Mma::ArchTag;
static int const kStages = Mma::kStages;
static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;
/// Warp count (concept: GemmShape)
using WarpCount = typename Mma::WarpCount;
static int const kThreadCount = 32 * WarpCount::kCount;
/// Split-K preserves splits that are 128b aligned
static int const kSplitKAlignment = const_max(
128 / sizeof_bits<ElementA>::value,
128 / sizeof_bits<ElementB>::value
);
//
// Structures
//
/// Argument structure
struct Arguments : UniversalArgumentsBase {
//
// Data members
//
typename EpilogueVisitor::Arguments epilogue_visitor;
void const * ptr_A;
void const * ptr_B;
void const * ptr_C;
void * ptr_D;
int64_t batch_stride_A;
int64_t batch_stride_B;
int64_t batch_stride_C;
typename LayoutA::Stride stride_a;
typename LayoutB::Stride stride_b;
typename LayoutC::Stride stride_c;
typename LayoutC::Stride stride_d;
typename LayoutA::Stride::LongIndex lda;
typename LayoutB::Stride::LongIndex ldb;
typename LayoutC::Stride::LongIndex ldc;
typename LayoutC::Stride::LongIndex ldd;
int const * ptr_gather_A_indices;
int const * ptr_gather_B_indices;
int const * ptr_scatter_D_indices;
//
// Methods
//
Arguments():
ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
ptr_gather_A_indices(nullptr),
ptr_gather_B_indices(nullptr),
ptr_scatter_D_indices(nullptr) {}
/// constructs an arguments structure
Arguments(
GemmUniversalMode mode,
GemmCoord problem_size,
int batch_count,
typename EpilogueVisitor::Arguments epilogue_visitor,
void const * ptr_A,
void const * ptr_B,
void const * ptr_C,
void * ptr_D,
int64_t batch_stride_A,
int64_t batch_stride_B,
int64_t batch_stride_C,
int64_t batch_stride_D,
typename LayoutA::Stride stride_a,
typename LayoutB::Stride stride_b,
typename LayoutC::Stride stride_c,
typename LayoutC::Stride stride_d,
int const *ptr_gather_A_indices = nullptr,
int const *ptr_gather_B_indices = nullptr,
int const *ptr_scatter_D_indices = nullptr
):
UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
epilogue_visitor(epilogue_visitor),
ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
ptr_scatter_D_indices(ptr_scatter_D_indices) {
lda = 0;
ldb = 0;
ldc = 0;
ldd = 0;
CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
}
/// constructs an arguments structure
Arguments(
GemmUniversalMode mode,
GemmCoord problem_size,
int batch_count,
typename EpilogueVisitor::Arguments epilogue_visitor,
void const * ptr_A,
void const * ptr_B,
void const * ptr_C,
void * ptr_D,
int64_t batch_stride_A,
int64_t batch_stride_B,
int64_t batch_stride_C,
int64_t batch_stride_D,
typename LayoutA::Stride::LongIndex lda,
typename LayoutB::Stride::LongIndex ldb,
typename LayoutC::Stride::LongIndex ldc,
typename LayoutC::Stride::LongIndex ldd,
int const *ptr_gather_A_indices = nullptr,
int const *ptr_gather_B_indices = nullptr,
int const *ptr_scatter_D_indices = nullptr
):
UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
epilogue_visitor(epilogue_visitor),
ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
ptr_scatter_D_indices(ptr_scatter_D_indices) {
stride_a = make_Coord(lda);
stride_b = make_Coord(ldb);
stride_c = make_Coord(ldc);
stride_d = make_Coord(ldd);
CUTLASS_TRACE_HOST("GemmUniversal::Arguments::Arguments() - problem_size: " << problem_size);
}
/// Returns arguments for the transposed problem
Arguments transposed_problem() const {
Arguments args(*this);
std::swap(args.problem_size.m(), args.problem_size.n());
std::swap(args.ptr_A, args.ptr_B);
std::swap(args.lda, args.ldb);
std::swap(args.stride_a, args.stride_b);
std::swap(args.batch_stride_A, args.batch_stride_B);
std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);
return args;
}
};
//
// Structure for precomputing values in host memory and passing to kernels
//
/// Parameters structure
struct Params : UniversalParamsBase<
ThreadblockSwizzle,
ThreadblockShape,
ElementA,
ElementB,
ElementC> {
using ParamsBase = UniversalParamsBase<
ThreadblockSwizzle,
ThreadblockShape,
ElementA,
ElementB,
ElementC>;
typename Mma::IteratorA::Params params_A;
typename Mma::IteratorB::Params params_B;
typename EpilogueVisitor::OutputTileIterator::Params params_C;
typename EpilogueVisitor::OutputTileIterator::Params params_D;
typename EpilogueVisitor::Params epilogue_visitor;
void * ptr_A;
void * ptr_B;
void * ptr_C;
void * ptr_D;
int64_t batch_stride_A;
int64_t batch_stride_B;
int64_t batch_stride_C;
int * ptr_gather_A_indices;
int * ptr_gather_B_indices;
int * ptr_scatter_D_indices;
int *semaphore;
//
// Methods
//
/// Default constructor
Params() = default;
CUTLASS_HOST_DEVICE
Params(
Arguments const &args,
int device_sms,
int sm_occupancy
):
ParamsBase(args, device_sms, sm_occupancy),
params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
epilogue_visitor(args.epilogue_visitor),
ptr_A(const_cast<void *>(args.ptr_A)),
ptr_B(const_cast<void *>(args.ptr_B)),
ptr_C(const_cast<void *>(args.ptr_C)),
ptr_D(args.ptr_D),
batch_stride_A(args.batch_stride_A),
batch_stride_B(args.batch_stride_B),
batch_stride_C(args.batch_stride_C),
ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices)) {
}
CUTLASS_HOST_DEVICE
void update(
Arguments const &args,
void *workspace = nullptr) {
ptr_A = const_cast<void *>(args.ptr_A);
ptr_B = const_cast<void *>(args.ptr_B);
ptr_C = const_cast<void *>(args.ptr_C);
ptr_D = args.ptr_D;
ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);
batch_stride_A = args.batch_stride_A;
batch_stride_B = args.batch_stride_B;
batch_stride_C = args.batch_stride_C;
epilogue_visitor = args.epilogue_visitor;
semaphore = static_cast<int *>(workspace);
CUTLASS_TRACE_HOST("GemmUniversal::Params::update()");
}
};
/// Shared memory storage structure
union SharedStorage {
typename Mma::SharedStorage main_loop;
typename Epilogue::SharedStorage epilogue;
typename EpilogueVisitor::SharedStorage visitor;
};
public:
//
// Methods
//
CUTLASS_DEVICE
GemmUniversalwithEpilogueVisitor() { }
/// Determines whether kernel satisfies alignment
static Status can_implement(
cutlass::gemm::GemmCoord const & problem_size) {
CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::can_implement()");
static int const kAlignmentA = (platform::is_same<LayoutA,
layout::ColumnMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutA,
layout::ColumnMajorInterleaved<64>>::value)
? 64
: Mma::IteratorA::AccessType::kElements;
static int const kAlignmentB = (platform::is_same<LayoutB,
layout::RowMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutB,
layout::RowMajorInterleaved<64>>::value)
? 64
: Mma::IteratorB::AccessType::kElements;
static int const kAlignmentC = (platform::is_same<LayoutC,
layout::ColumnMajorInterleaved<32>>::value)
? 32
: (platform::is_same<LayoutC,
layout::ColumnMajorInterleaved<64>>::value)
? 64
: Epilogue::OutputTileIterator::kElementsPerAccess;
bool isAMisaligned = false;
bool isBMisaligned = false;
bool isCMisaligned = false;
if (platform::is_same<LayoutA, layout::RowMajor>::value) {
isAMisaligned = problem_size.k() % kAlignmentA;
} else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
isAMisaligned = problem_size.m() % kAlignmentA;
} else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
|| platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
isAMisaligned = problem_size.k() % kAlignmentA;
}
if (platform::is_same<LayoutB, layout::RowMajor>::value) {
isBMisaligned = problem_size.n() % kAlignmentB;
} else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
isBMisaligned = problem_size.k() % kAlignmentB;
} else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
|| platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
isBMisaligned = problem_size.k() % kAlignmentB;
}
if (platform::is_same<LayoutC, layout::RowMajor>::value) {
isCMisaligned = problem_size.n() % kAlignmentC;
} else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
isCMisaligned = problem_size.m() % kAlignmentC;
} else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
|| platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
isCMisaligned = problem_size.n() % kAlignmentC;
}
if (isAMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for A operand");
return Status::kErrorMisalignedOperand;
}
if (isBMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for B operand");
return Status::kErrorMisalignedOperand;
}
if (isCMisaligned) {
CUTLASS_TRACE_HOST(" returning kErrorMisalignedOperand for C operand");
return Status::kErrorMisalignedOperand;
}
CUTLASS_TRACE_HOST(" returning kSuccess");
return Status::kSuccess;
}
static Status can_implement(Arguments const &args) {
return can_implement(args.problem_size);
}
/// Executes one GEMM
CUTLASS_DEVICE
void operator()(Params const &params, SharedStorage &shared_storage) {
// Compute threadblock location
ThreadblockSwizzle threadblock_swizzle;
cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Early exit if CTA is out of range
if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {
return;
}
int offset_k = 0;
int problem_size_k = params.problem_size.k();
ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);
//
// Fetch pointers based on mode.
//
if (params.mode == GemmUniversalMode::kGemm ||
params.mode == GemmUniversalMode::kGemmSplitKParallel) {
if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {
problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
}
offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
}
else if (params.mode == GemmUniversalMode::kBatched) {
ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
}
else if (params.mode == GemmUniversalMode::kArray) {
ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
}
__syncthreads();
// Compute initial location in logical coordinates
cutlass::MatrixCoord tb_offset_A{
threadblock_tile_offset.m() * Mma::Shape::kM,
offset_k,
};
cutlass::MatrixCoord tb_offset_B{
offset_k,
threadblock_tile_offset.n() * Mma::Shape::kN
};
// Compute position within threadblock
int thread_idx = threadIdx.x;
// Construct iterators to A and B operands
typename Mma::IteratorA iterator_A(
params.params_A,
ptr_A,
{params.problem_size.m(), problem_size_k},
thread_idx,
tb_offset_A,
params.ptr_gather_A_indices);
typename Mma::IteratorB iterator_B(
params.params_B,
ptr_B,
{problem_size_k, params.problem_size.n()},
thread_idx,
tb_offset_B,
params.ptr_gather_B_indices);
// Broadcast the warp_id computed by lane 0 to ensure dependent code
// is compiled as warp-uniform.
int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
int lane_idx = threadIdx.x % 32;
//
// Main loop
//
// Construct thread-scoped matrix multiply
Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
typename Mma::FragmentC accumulators;
accumulators.clear();
// Number of threadblock-scoped k iterations (ceiling division over the tile's K extent)
int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;
// Compute threadblock-scoped matrix multiply-add
mma(
gemm_k_iterations,
accumulators,
iterator_A,
iterator_B,
accumulators);
//
// Epilogue
//
// Masked tile iterators constructed from members
//
threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
// Assume identity swizzle
MatrixCoord threadblock_offset(
threadblock_tile_offset.m() * Mma::Shape::kM,
threadblock_tile_offset.n() * Mma::Shape::kN
);
int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();
ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);
//
// Fetch pointers based on mode.
//
// Construct the semaphore.
Semaphore semaphore(params.semaphore + block_idx, thread_idx);
// Tile iterator loading from source tensor.
EpilogueVisitor epilogue_visitor(
params.epilogue_visitor,
shared_storage.visitor,
threadblock_offset,
threadblock_tile_offset,
thread_idx,
params.problem_size.mn()
);
if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
}
Epilogue epilogue(
shared_storage.epilogue,
thread_idx,
warp_idx,
lane_idx);
// Wait on the semaphore - this latency may have been covered by iterator construction
if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
// For subsequent threadblocks, the source matrix is held in the 'D' tensor.
semaphore.wait(threadblock_tile_offset.k());
}
// Execute the epilogue operator to update the destination tensor.
epilogue(epilogue_visitor, accumulators);
//
// Release the semaphore
//
if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {
int lock = 0;
if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {
// The final threadblock resets the semaphore for subsequent grids.
lock = 0;
}
else {
// Otherwise, the semaphore is incremented
lock = threadblock_tile_offset.k() + 1;
}
semaphore.release(lock);
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////
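To make the serial split-K handshake in the epilogue above concrete, here is an illustrative Python sketch of the lock sequence (not part of the source; wait_until/release stand in for Semaphore::wait and Semaphore::release):

# Each k-slice waits until the semaphore holds its own index, runs the
# epilogue, then releases slice+1; the final slice resets the lock to 0.
for b in range(num_k_slices):          # each b runs in a distinct threadblock
    wait_until(semaphore, b)           # semaphore.wait(threadblock_tile_offset.k())
    run_epilogue(b)
    release(semaphore, 0 if b == num_k_slices - 1 else b + 1)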

View File

@ -1,47 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm host helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_gemm_host_helper(py::module &m) {
m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::RowMajorInterleaved<32>>);
m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::ColumnMajorInterleaved<32>>);
}
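For reference, a minimal Python-side sketch of invoking the reorder helper bound above; the import name `cutlass` and the construction of the interleaved tensor refs are assumptions, not part of this file:

# dest_ref / src_ref: int8 TensorRefs with RowMajorInterleaved<32> layout,
# e.g. built via get_tensor_ref; problem_size: a cutlass.gemm.GemmCoord.
cutlass.reorder_column(dest_ref, src_ref, problem_size)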

View File

@ -1,47 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "tensor.h"
#include "matrix.h"
namespace py = pybind11;
void bind_layout(py::module &m) {
bind_tensor_layout(m);
bind_matrix_layout(m);
}

View File

@ -1,87 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Matrix layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/matrix.h"
namespace py = pybind11;
void bind_matrix_layout(py::module &m) {
//
// Matrix layouts
// cutlass/layout/matrix.h
//
py::class_<cutlass::layout::RowMajor>(m, "RowMajor", R"pbdoc(
Mapping function for row-major matrices.
)pbdoc")
.def_static("packed", &cutlass::layout::RowMajor::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::RowMajor & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::ColumnMajor>(m, "ColumnMajor", R"pbdoc(
Mapping function for column-major matrices.
)pbdoc")
.def_static("packed", &cutlass::layout::ColumnMajor::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc" )
.def("stride", [](const cutlass::layout::ColumnMajor & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::RowMajorInterleaved<32>>(m, "RowMajorInterleaved32",
R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as row-major arrangement of fixed-size columns 32)pbdoc")
.def_static("packed", &cutlass::layout::RowMajorInterleaved<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::RowMajorInterleaved<32> & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
py::class_<cutlass::layout::ColumnMajorInterleaved<32>>(m, "ColumnMajorInterleaved32",
R"pbdoc(Mapping function for interleaved matrices. Matrix is structured
as column-major arrangement of fixed-size rows 32)pbdoc")
.def_static("packed", &cutlass::layout::ColumnMajorInterleaved<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", [](const cutlass::layout::ColumnMajorInterleaved<32> & layout){
return layout.stride().at(0);
}, R"pbdoc(Returns the stride of the layout)pbdoc");
}
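A brief usage sketch of the matrix-layout bindings above from Python (module name assumed; MatrixCoord is bound elsewhere in this commit):

# Packed 128x64 row-major layout; stride() yields the leading dimension.
layout = cutlass.RowMajor.packed(cutlass.MatrixCoord(128, 64))
assert layout.stride() == 64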

View File

@ -1,74 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/layout/tensor.h"
namespace py = pybind11;
void bind_tensor_layout(py::module &m) {
//
// Tensor layouts
// cutlass/include/cutlass/layout/tensor.h
//
/// Mapping function for 4-D NHWC tensors.
py::class_<cutlass::layout::TensorNHWC>(m, "TensorNHWC",
R"pbdoc(Mapping function for 4-D NHWC tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorNHWC::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed NHWC tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorNHWC::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
/// Mapping function for 4-D NC/xHWx tensors.
py::class_<cutlass::layout::TensorNCxHWx<32>>(m, "TensorNC32HW32",
R"pbdoc(Mapping function for 4-D NC/32HW32 tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorNCxHWx<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorNCxHWx<32>::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
/// Mapping function for 4-D CxRSKx tensors.
py::class_<cutlass::layout::TensorCxRSKx<32>>(m, "TensorC32RSK32",
R"pbdoc(Mapping function for 4-D C32RSK32 tensors)pbdoc")
.def_static("packed", &cutlass::layout::TensorCxRSKx<32>::packed,
py::arg("extent"),
R"pbdoc(Helper returns a layout to a tightly packed tensor)pbdoc")
.def("stride", py::overload_cast<>(&cutlass::layout::TensorCxRSKx<32>::stride),
R"pbdoc(Returns the stride of the layout)pbdoc");
}
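A hypothetical Python-side use of the NHWC layout binding (import name assumed):

# Packed NHWC layout for a 1x7x7x64 tensor; the packed strides are
# (C, W*C, H*W*C) = (64, 448, 3136).
layout = cutlass.TensorNHWC.packed(cutlass.Tensor4DCoord(1, 7, 7, 64))
strides = layout.stride()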

View File

@ -1,159 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind threadblock swizzling to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <memory>
#include <string>
namespace py = pybind11;
// Demangles a C++ symbol name. __cxa_demangle returns a malloc()-allocated
// buffer, so it must be released with std::free rather than delete.
std::string demangle(const char* mangled_name) {
int status = 0;
std::unique_ptr<char, void (*)(void*)> ptr(
__cxxabiv1::__cxa_demangle(mangled_name, nullptr, nullptr, &status),
std::free);
return (status == 0 && ptr) ? std::string(ptr.get()) : std::string(mangled_name);
}
template<typename T>
void bind_identity_swizzle(py::module & m, std::string name) {
py::class_<T>(m, name.c_str(),
R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc")
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv3dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NZPQK, NDHWC, KTRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", &T::get_grid_shape,
py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
template<typename T>
void bind_swizzle(py::module & m, std::string name, std::string doc) {
py::class_<T>(m, name.c_str(), doc.c_str())
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: gemm(M, N, K)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", &T::get_grid_shape,
py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
template<typename T>
void bind_dgrad_swizzle(py::module & m, std::string name) {
py::class_<T>(m, name.c_str(),
R"pbdoc(Threadblock swizzling function for strided dgrad convolution)pbdoc")
.def(py::init<>())
.def("get_tiled_shape",
py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
&T::get_tiled_shape, py::const_
), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
R"pbdoc(Returns the shape of the problem in units of logical tiles
:param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
:type problem_size: :class:`cutlass.gemm.GemmCoord`
)pbdoc")
.def("get_grid_shape", [](const T & swizzle, cutlass::gemm::GemmCoord tiled_shape) {
return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
}, py::arg("tiled_shape"),
R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
.def("tag", [](const T & swizzle){
return demangle(typeid(T).name());
}, R"pbdoc(Returns the c++ name of the swizzling for code emission)pbdoc");
}
void bind_threadblock_swizzle(py::module &m) {
py::class_<dim3>(m, "dim3",
R"pbdoc(A int3 type xyz contains three integers)pbdoc")
.def(py::init<int, int, int>(),
py::arg("x"), py::arg("y"), py::arg("z"))
.def_readwrite("x", &dim3::x, R"pbdoc(get value x)pbdoc")
.def_readwrite("y", &dim3::y, R"pbdoc(get value y)pbdoc")
.def_readwrite("z", &dim3::z, R"pbdoc(get value z)pbdoc");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>>(m, "IdentitySwizzle1");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>>(m, "IdentitySwizzle2");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>>(m, "IdentitySwizzle4");
bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>>(m, "IdentitySwizzle8");
bind_swizzle<cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle>(m, "HorizontalSwizzle", R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
bind_swizzle<cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle>(m, "BatchedIdentitySwizzle", R"pbdoc(Threadblock swizzling function for batched GEMMs)pbdoc");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>>(m, "StridedDgradIdentitySwizzle1");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>>(m, "StridedDgradIdentitySwizzle4");
bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle>(m, "StridedDgradHorizontalSwizzle");
}
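Illustrative Python usage of the swizzle bindings above; `cutlass.gemm.GemmCoord` is assumed to be bound elsewhere in the extension:

# Tile a 1024x512x256 GEMM into 128x128x32 threadblocks with no split-K.
swizzle = cutlass.IdentitySwizzle1()
tiled = swizzle.get_tiled_shape(cutlass.gemm.GemmCoord(1024, 512, 256),
                                cutlass.gemm.GemmCoord(128, 128, 32), 1)
grid = swizzle.get_grid_shape(tiled)   # dim3 covering the 8x4 grid of tiles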

View File

@ -1,78 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Tensor Coord to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_coord.h"
namespace py = pybind11;
void bind_tensor_coord(py::module &m) {
//
// Tensor Coords
// cutlass/include/cutlass/tensor_coord.h
//
/// Defines a canonical 4D coordinate used by tensor operations.
py::class_<cutlass::Tensor4DCoord>(m, "Tensor4DCoord",
R"pbdoc(Defines a canonical 4D coordinate used by tensor operations)pbdoc")
.def(py::init<int, int, int, int>(),
py::arg("n"), py::arg("h"), py::arg("w"), py::arg("c"),
R"pbdoc(Helper to construct from N, H, W, and C)pbdoc")
.def("at", py::overload_cast<int>(&cutlass::Tensor4DCoord::at),
py::arg("dim"),
R"pbdoc(Gets the index of a given Coord element)pbdoc")
.def("size", [](const cutlass::Tensor4DCoord & coord) {
return coord.at(0) * coord.at(1) * coord.at(2) * coord.at(3);},
R"pbdoc(The size of the tensor coord)pbdoc");
py::class_<cutlass::Coord<3>>(m, "Tensor3DCoord",
R"pbdoc(Defines a canonical 3D coordinate used by tensor operations)pbdoc")
.def("at", py::overload_cast<int>(&cutlass::Coord<3>::at),
py::arg("dim"),
R"pbdoc(Gets the index of a given Coord element)pbdoc");
// Matrix Size
py::class_<cutlass::MatrixCoord>(m, "MatrixCoord",
R"pbdoc(MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.)pbdoc")
.def(py::init<int, int>(),
py::arg("row"), py::arg("column"), R"pbdoc(Helper to construct from a row and column)pbdoc")
.def("row", py::overload_cast<>(&cutlass::MatrixCoord::row),
R"pbdoc(Returns the row of the coordinate)pbdoc")
.def("column", py::overload_cast<>(&cutlass::MatrixCoord::column),
R"pbdoc(Returns the column of the coordinate)pbdoc");
}
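A short sketch of these coordinate bindings in use (module name assumed):

coord = cutlass.Tensor4DCoord(2, 7, 7, 64)
assert coord.size() == 2 * 7 * 7 * 64      # product of all four extents
mc = cutlass.MatrixCoord(128, 64)
assert (mc.row(), mc.column()) == (128, 64)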

View File

@ -1,102 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind TensorRef and View to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/tensor_ref.h"
#include "cutlass/tensor_view.h"
#include "types.h"
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);
})
.def("layout", py::overload_cast<>(&cutlass::TensorRef<T, L>::layout));
m.def("get_tensor_ref", [](int64_t address, TF data, const L& layout_) {
T* ptr = reinterpret_cast<T*>(address);
cutlass::TensorRef<T, L> tensor_ref = cutlass::TensorRef<T, L>(ptr, layout_);
return tensor_ref;
});
py::class_<cutlass::TensorView<T, L>>(m, ("TensorView" + name).c_str())
.def(py::init<const cutlass::TensorRef<T, L>&, const typename L::TensorCoord &>());
}
void bind_tensor_refs_and_views(py::module &m) {
/// float
bind_tensor_ref_view<float, cutlass::layout::RowMajor, cutlass::float32>(m, "F32RowMajor");
bind_tensor_ref_view<float, cutlass::layout::ColumnMajor, cutlass::float32>(m, "F32ColumnMajor");
bind_tensor_ref_view<float, cutlass::layout::TensorNHWC, cutlass::float32>(m, "F32NHWC");
/// double
bind_tensor_ref_view<double, cutlass::layout::RowMajor, cutlass::float64>(m, "F64RowMajor");
bind_tensor_ref_view<double, cutlass::layout::ColumnMajor, cutlass::float64>(m, "F64ColumnMajor");
bind_tensor_ref_view<double, cutlass::layout::TensorNHWC, cutlass::float64>(m, "F64NHWC");
// half_t
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t>(m, "F16RowMajor");
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t>(m, "F16ColumnMajor");
bind_tensor_ref_view<cutlass::half_t, cutlass::layout::TensorNHWC, cutlass::half_t>(m, "F16NHWC");
// bfloat16
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t>(m, "BF16RowMajor");
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::ColumnMajor, cutlass::bfloat16_t>(m, "BF16ColumnMajor");
bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::TensorNHWC, cutlass::bfloat16_t>(m, "BF16NHWC");
// int8_t
bind_tensor_ref_view<int8_t, cutlass::layout::RowMajorInterleaved<32>, cutlass::int8>(m, "S8RowMajorInterleaved32");
bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajorInterleaved<32>, cutlass::int8>(m, "S8ColumnMajorInterleaved32");
bind_tensor_ref_view<int8_t, cutlass::layout::RowMajor, cutlass::int8>(m, "S8RowMajor");
bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajor, cutlass::int8>(m, "S8ColumnMajor");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorNHWC, cutlass::int8>(m, "S8NHWC");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorNCxHWx<32>, cutlass::int8>(m, "S8NC32HW32");
bind_tensor_ref_view<int8_t, cutlass::layout::TensorCxRSKx<32>, cutlass::int8>(m, "S8C32RSK32");
// int32_t
bind_tensor_ref_view<int32_t, cutlass::layout::RowMajor, cutlass::int32>(m, "S32RowMajor");
bind_tensor_ref_view<int32_t, cutlass::layout::ColumnMajor, cutlass::int32>(m, "S32ColumnMajor");
bind_tensor_ref_view<int32_t, cutlass::layout::TensorNHWC, cutlass::int32>(m, "S32NHWC");
}
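A hypothetical round-trip through the TensorRef bindings, passing a raw pointer as an integer address (numpy used for illustration; import name assumed):

import numpy as np
a = np.zeros((128, 64), dtype=np.float32)
layout = cutlass.RowMajor.packed(cutlass.MatrixCoord(128, 64))
ref = cutlass.TensorRefF32RowMajor(a.ctypes.data, layout)
assert ref.data() == a.ctypes.data     # the raw pointer round-trips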

View File

@ -1,146 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind CUTLASS types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/half.h"
namespace py = pybind11;
namespace cutlass {
/// 8-bit signed integer
struct alignas(1) int8 {
int8_t storage;
explicit int8(int x) {
storage = int8_t(x);
}
explicit int8(float x) {
storage = int8_t(x);
}
int8_t c_value(){return storage;}
};
/// 32-bit signed integer
struct alignas(4) int32 {
int storage;
explicit int32(int x) {
storage = x;
}
explicit int32(float x) {
storage = int(x);
}
int c_value(){return storage;}
};
/// IEEE single-precision floating-point type
struct alignas(4) float32 {
float storage;
explicit float32(float x) {
storage = x;
}
explicit float32(int x) {
storage = float(x);
}
float c_value(){return storage;}
};
/// IEEE double-precision floating-point type
struct alignas(8) float64 {
double storage;
explicit float64(float x) {
storage = double(x);
}
explicit float64(int x) {
storage = double(x);
}
double c_value(){return storage;}
};
}
void bind_cutlass_types(py::module &m) {
// s8
py::class_<cutlass::int8>(m, "int8")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::int8::storage)
.def("value", &cutlass::int8::c_value);
// s32
py::class_<cutlass::int32>(m, "int32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::int32::storage)
.def("value", &cutlass::int32::c_value);
// f16
py::class_<cutlass::half_t>(m, "float16")
.def(py::init<float>())
.def(py::init<double>())
.def(py::init<int>())
.def(py::init<unsigned>())
.def_readwrite("storage", &cutlass::half_t::storage)
.def("value", [](const cutlass::half_t& value) {return value;});
// bf16
py::class_<cutlass::bfloat16_t>(m, "bfloat16")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::bfloat16_t::storage)
.def("value", [](const cutlass::bfloat16_t& value) {return value;});
// f32
py::class_<cutlass::float32>(m, "float32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::float32::storage)
.def("value", &cutlass::float32::c_value);
// tf32
py::class_<cutlass::tfloat32_t>(m, "tfloat32")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::tfloat32_t::storage)
.def("value", [](const cutlass::tfloat32_t& value) {return value;});
// f64
py::class_<cutlass::float64>(m, "float64")
.def(py::init<float>())
.def(py::init<int>())
.def_readwrite("storage", &cutlass::float64::storage)
.def("value", &cutlass::float64::c_value);
}
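A small sketch of the numeric-type wrappers from Python (module name assumed):

x = cutlass.float16(1.5)
bits = x.storage                       # raw 16-bit representation of the half
assert cutlass.int32(7).value() == 7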

View File

@ -1,32 +0,0 @@
#include <cutlass/complex.h>
namespace cutlass {
/// ENUM class for datatypes
enum class DataType {
kB1, kU2, kU4, kU8,
kU16, kU32, kU64, kS2,
kS4, kS8, kS16, kS32,
kS64, kF16, kBF16, kF32,
kTF32, kF64, kCF16, kCBF16,
kCF32, kCTF32, kCF64, kCS2,
kCS4, kCS8, kCS16, kCS32,
kCS64, kCU2, kCU4, kCU8,
kCU16, kCU32, kCU64, kInvalid
};
/// ENUM class for LayoutTypes
enum class LayoutType {
kColumnMajor, kRowMajor,
kColumnMajorInterleaved2, kRowMajorInterleaved2,
kColumnMajorInterleaved32, kRowMajorInterleaved32,
kColumnMajorInterleaved64, kRowMajorInterleaved64,
kTensorNHWC, kTensorNDHWC, kTensorNCHW, kTensorNGHWC,
kTensorNC32HW32, kTensorNC64HW64, kTensorC32RSK32,
kTensorC64RSK64
};
} // namespace cutlass

View File

@ -1,54 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution problems to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/conv2d_problems.h"
#include "cutlass/conv/conv2d_problem_size.h"
namespace py = pybind11;
PYBIND11_MAKE_OPAQUE(std::vector<cutlass::conv::Conv2dProblemSize>);
void bind_conv_problem_size_test(py::module &m) {
py::bind_vector<std::vector<cutlass::conv::Conv2dProblemSize>>(m, "Conv2dProblemVector")
.def("size", &std::vector<cutlass::conv::Conv2dProblemSize>::size);
// Get Conv2d problem sizes
py::class_<test::conv::device::TestbedConv2dProblemSizes>(m, "TestbedConv2dProblemSizes")
.def(py::init<int>())
.def_readonly("conv2d_default_sizes", &test::conv::device::TestbedConv2dProblemSizes::conv2d_default_sizes);
}
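Hypothetical Python usage of the problem-size testbed; the module path and the meaning of the constructor argument (a minimum channel count) are assumptions:

sizes = cutlass.test.conv.TestbedConv2dProblemSizes(64)
n = sizes.conv2d_default_sizes.size()  # number of default Conv2d problems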

View File

@ -1,49 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind convolution related types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "conv_problems.h"
#include "host.h"
namespace py = pybind11;
void bind_convolution_test(py::module &m) {
// Conv problem sizes
bind_conv_problem_size_test(m);
py::module_ host_submodule = m.def_submodule("host");
bind_conv_host_references(host_submodule);
}

View File

@ -1,180 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind Convolution host test helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/cache_testbed_output.h"
#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"
namespace py = pybind11;
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host(py::module &m) {
m.def("conv2d", \
&cutlass::reference::host::Conv2d< \
Ta, La, Tb, Lb, Tc, Lc, Te, Tacc>);
m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host_sat(py::module &m) {
m.def("conv2d", \
&cutlass::reference::host::Conv2d< \
Ta, La, Tb, Lb, Tc, Lc, Te, Tacc, cutlass::NumericConverterClamp<Tc, Te>>);
m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nhwc(py::module &m) {
bind_conv2d_host<
Ta, cutlass::layout::TensorNHWC,
Tb, cutlass::layout::TensorNHWC,
Tc, cutlass::layout::TensorNHWC,
Tacc, Te>(m);
}
template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nc32hw32(py::module &m) {
bind_conv2d_host_sat<
Ta, cutlass::layout::TensorNCxHWx<32>,
Tb, cutlass::layout::TensorCxRSKx<32>,
Tc, cutlass::layout::TensorNCxHWx<32>,
Tacc, Te>(m);
}
template<typename T, typename Layout>
void bind_tensor_equals(py::module &m) {
m.def("equals", py::overload_cast<
const cutlass::TensorView<T, Layout>&, const cutlass::TensorView<T, Layout>&>(
&cutlass::reference::host::TensorEquals<T, Layout>
));
}
#define BIND_TENSOR_HASH(Element, Layout) { \
m.def("TensorHash", &test::conv::device::TensorHash<Element, Layout>, py::arg("view"), py::arg("hash") = test::conv::device::CRC32(), py::arg("crc")=uint32_t()); \
}
void bind_conv_host_references(py::module &m) {
//
// Conv2d reference on host
// tools/util/include/cutlass/util/reference/host/convolution.h
/// double
bind_conv2d_host_nhwc<double, double, double, double, double>(m);
/// float
bind_conv2d_host_nhwc<float, float, float, float, float>(m);
/// half
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, cutlass::half_t>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, float>(m);
bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, float>(m);
/// bfloat16
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, cutlass::bfloat16_t>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, cutlass::bfloat16_t>(m);
bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);
/// s8
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, float>(m);
bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, float>(m);
bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, float>(m);
//
// Compare whether two tensors are equal
//
/// double
bind_tensor_equals<double, cutlass::layout::TensorNHWC>(m);
/// float
bind_tensor_equals<float, cutlass::layout::TensorNHWC>(m);
/// half
bind_tensor_equals<cutlass::half_t, cutlass::layout::TensorNHWC>(m);
/// bfloat16
bind_tensor_equals<cutlass::bfloat16_t, cutlass::layout::TensorNHWC>(m);
/// s32
bind_tensor_equals<int32_t, cutlass::layout::TensorNHWC>(m);
bind_tensor_equals<int32_t, cutlass::layout::TensorNCxHWx<32>>(m);
/// s8
bind_tensor_equals<int8_t, cutlass::layout::TensorNHWC>(m);
bind_tensor_equals<int8_t, cutlass::layout::TensorNCxHWx<32>>(m);
/// Cache
py::class_<test::conv::device::CachedTestKey>(m, "CachedTestKey")
.def(py::init<>())
.def(py::init<std::string, std::string, std::string, uint32_t, uint32_t, uint32_t>());
py::class_<test::conv::device::CachedTestResult>(m, "CachedTestResult")
.def(py::init<>())
.def(py::init<uint32_t>())
.def_readwrite("D", &test::conv::device::CachedTestResult::D);
py::class_<test::conv::device::CachedTestResultListing>(m, "CachedTestResultListing")
.def(py::init<const std::string &>())
.def("find", &test::conv::device::CachedTestResultListing::find)
.def("append", &test::conv::device::CachedTestResultListing::append)
.def("write", &test::conv::device::CachedTestResultListing::write);
py::class_<test::conv::device::CRC32>(m, "CRC32")
.def(py::init<>());
BIND_TENSOR_HASH(double, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(float, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(cutlass::half_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(cutlass::bfloat16_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(int32_t, cutlass::layout::TensorNHWC);
BIND_TENSOR_HASH(int8_t, cutlass::layout::TensorNCxHWx<32>);
}
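A sketch of the result-cache workflow these bindings expose; `find` is assumed to surface the C++ std::pair return value as a Python (hit, result) tuple:

listing = cutlass.CachedTestResultListing("conv2d_cached_results.txt")
key = cutlass.CachedTestKey()          # default-constructed key, for illustration
hit, cached = listing.find(key)
if hit:
    reference_crc = cached.D           # cached hash of the output tensor D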

View File

@ -1,45 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "host.h"
namespace py = pybind11;
void bind_gemm_test(py::module &m) {
py::module_ host_submodule = m.def_submodule("host");
bind_gemm_host_reference(host_submodule);
}

View File

@ -1,431 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/* \file
\brief Bind gemm test host functions to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "cutlass/cutlass.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/host_reorder.h"
#include "cutlass/functional.h"
namespace py = pybind11;
template<
typename ElementA, typename LayoutA,
typename ElementB, typename LayoutB,
typename ElementC, typename LayoutC,
typename AccumulatorType, typename ComputeType,
typename InnerProductOp>
void bind_host_gemm_saturate(py::module &m) {
m.def("gemm_saturate", py::overload_cast<
cutlass::gemm::GemmCoord, ComputeType,
cutlass::TensorRef<ElementA, LayoutA>,
cutlass::TensorRef<ElementB, LayoutB>,
ComputeType,
cutlass::TensorRef<ElementC, LayoutC>,
cutlass::TensorRef<ElementC, LayoutC>,
AccumulatorType>(
&cutlass::reference::host::compute_gemm<
ElementA, LayoutA,
ElementB, LayoutB,
ElementC, LayoutC,
ComputeType,
AccumulatorType,
InnerProductOp,
cutlass::NumericConverterClamp<ElementC, AccumulatorType>>
));
}
template<
typename ElementA, typename LayoutA,
typename ElementB, typename LayoutB,
typename ElementC, typename LayoutC,
typename AccumulatorType, typename ComputeType,
typename InnerProductOp>
void bind_host_gemm(py::module &m) {
m.def("gemm", py::overload_cast<
cutlass::gemm::GemmCoord, ComputeType,
cutlass::TensorRef<ElementA, LayoutA>,
cutlass::TensorRef<ElementB, LayoutB>,
ComputeType,
cutlass::TensorRef<ElementC, LayoutC>,
cutlass::TensorRef<ElementC, LayoutC>,
AccumulatorType>(
&cutlass::reference::host::compute_gemm<
ElementA, LayoutA,
ElementB, LayoutB,
ElementC, LayoutC,
ComputeType,
AccumulatorType,
InnerProductOp,
cutlass::NumericConverter<ElementC, AccumulatorType>>
));
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add(py::module &m) {
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate(py::module &m) {
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::RowMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::RowMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajor,
ElementB, cutlass::layout::ColumnMajor,
ElementC, cutlass::layout::ColumnMajor,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_interleaved(py::module &m) {
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
template<
typename ElementA, typename ElementB, typename ElementC,
typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate_interleaved(py::module &m) {
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::RowMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::RowMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::RowMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
bind_host_gemm_saturate<
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
AccumulatorType, ComputeType,
cutlass::multiply_add<AccumulatorType>>(m);
}
#define BIND_TENSOR_EQUAL(Element, Layout) { \
m.def("equals", py::overload_cast< \
const cutlass::TensorView<Element, Layout>&, const cutlass::TensorView<Element, Layout>&>( \
&cutlass::reference::host::TensorEquals<Element, Layout>)); \
}
void bind_gemm_host_reference(py::module &m) {
/// double
bind_host_gemm_multiply_add<double, double, double, double, double>(m);
/// float
bind_host_gemm_multiply_add<float, float, float, float, float>(m);
/// half_t
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, float, float>(m);
/// bfloat16
bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);
/// s8
bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, float>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);
// float
BIND_TENSOR_EQUAL(float, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(float, cutlass::layout::ColumnMajor);
// double
BIND_TENSOR_EQUAL(double, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(double, cutlass::layout::ColumnMajor);
// half_t
BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::ColumnMajor);
// bfloat16
BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::ColumnMajor);
// int32_t
BIND_TENSOR_EQUAL(int32_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(int32_t, cutlass::layout::ColumnMajor);
// int8_t
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajor);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajor);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajorInterleaved<32>);
BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajorInterleaved<32>);
}

View File

@ -1,55 +0,0 @@
import re
def SubstituteTemplate(template, values):
    """Repeatedly substitutes ${key} placeholders in `template` with `values[key]` until a fixed point is reached."""
    text = template
changed = True
while changed:
changed = False
for key, value in values.items():
regex = "\\$\\{%s\\}" % key
newtext = re.sub(regex, value, text)
if newtext != text:
changed = True
text = newtext
return text
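A minimal usage sketch of SubstituteTemplate, assuming only the function above is in scope (the type names in the strings are made up):

    nested = SubstituteTemplate(
        "using Element = ${outer};",
        {"outer": "cute::Int<${inner}>", "inner": "8"})
    # Substitution runs to a fixed point, so the ${inner} placeholder introduced
    # by the first pass is resolved on the next pass.
    assert nested == "using Element = cute::Int<8>;"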
from pycutlass.type_hint import *
from pycutlass.tensor_ref import *
from pycutlass.operation import *
from pycutlass.epilogue import *
from pycutlass.parser import *
from pycutlass.compiler import ArtifactManager
from pycutlass.memory_manager import *
from pycutlass.arguments import *
from pycutlass.library import *
from pycutlass.c_types import *
from pycutlass.gemm_operation import *
from pycutlass.conv2d_operation import *
from pycutlass.compiler import *
from pycutlass.utils import *
from pycutlass.frontend import *
from pycutlass.reduction_operation import *
from pycutlass.utils.device import device_cc
# module-wide variables
import sys
this = sys.modules[__name__]
# artifact manager
this.compiler = ArtifactManager()
try:
    if not hasattr(this, 'DEVICE_CC') or this.DEVICE_CC is None:
        this.DEVICE_CC = device_cc()
except Exception:
    # No CUDA device is visible at import time; fall back to the operations' arch tags.
    this.DEVICE_CC = None
def get_memory_pool(init_pool_size=0, max_pool_size=2**34):
this.memory_pool = PoolMemoryManager(
init_pool_size=init_pool_size,
max_pool_size=max_pool_size
)
return this.memory_pool
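A hedged usage sketch (assumes the pycutlass package and a CUDA device are available; the pool sizes are illustrative):

    import pycutlass
    # Reserve a 1 GiB device pool up front, allowing growth to 4 GiB.
    pool = pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)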

View File

@ -1,118 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from .frontend import CupyFrontend
from typeguard import typechecked
from pycutlass.frontend import *
from typing import Union
import numpy as np
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cudart
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
# @typechecked
class ArgumentBase:
"""
Base class for operation arguments
"""
def __init__(self,
A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
**kwargs) -> None:
# tensor_C can be interpreted as the bias with bias=True in keyword args
if "bias" in kwargs.keys():
self.bias = kwargs["bias"]
else:
# by default, tensor_C is not bias
self.bias = False
# preprocessing input tensors
if isinstance(A, np.ndarray):
self.host_D = D
self.buffer_A = NumpyFrontend.argument(A, False)
self.buffer_B = NumpyFrontend.argument(B, False)
self.buffer_C = NumpyFrontend.argument(C, False)
self.buffer_D = NumpyFrontend.argument(D, True)
self.ptr_A = self.buffer_A.ptr
self.ptr_B = self.buffer_B.ptr
self.ptr_C = self.buffer_C.ptr
self.ptr_D = self.buffer_D.ptr
# number of elements in C
self.tensor_c_numel = C.size
elif torch_available and isinstance(A, torch.Tensor):
self.ptr_A = TorchFrontend.argument(A)
self.ptr_B = TorchFrontend.argument(B)
self.ptr_C = TorchFrontend.argument(C)
self.ptr_D = TorchFrontend.argument(D)
# number of elements in C
self.tensor_c_numel = C.numel()
elif isinstance(A, cuda.CUdeviceptr):
self.ptr_A = A
self.ptr_B = B
self.ptr_C = C
self.ptr_D = D
elif cupy_available and isinstance(A, cp.ndarray):
self.ptr_A = CupyFrontend.argument(A)
self.ptr_B = CupyFrontend.argument(B)
self.ptr_C = CupyFrontend.argument(C)
self.ptr_D = CupyFrontend.argument(D)
# number of elements in C
self.tensor_c_numel = C.size
else:
        raise TypeError(
            "Unsupported frontend. Only NumPy arrays, Torch tensors, CuPy arrays, and raw CUdeviceptr values are supported")
def sync(self, stream_sync=True):
if stream_sync:
err, = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("CUDA Error %s" % str(err))
if hasattr(self, "host_D"):
err, = cuda.cuMemcpyDtoH(
self.host_D, self.ptr_D, self.host_D.size * self.host_D.itemsize)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
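For orientation, a hedged sketch of the NumPy path through ArgumentBase (assumes pycutlass is importable under this package layout and a CUDA device is present; shapes are illustrative):

    import numpy as np
    from pycutlass.arguments import ArgumentBase

    A = np.ones((16, 16), dtype=np.float32)
    B = np.ones((16, 16), dtype=np.float32)
    C = np.zeros((16, 16), dtype=np.float32)
    D = np.zeros((16, 16), dtype=np.float32)
    args = ArgumentBase(A, B, C, D)  # host arrays are copied into device buffers
    args.sync()                      # synchronize, then copy D back into the host array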

View File

@ -1,395 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for stamping out collective mainloops for SM90 kernels
"""
import cute
import cutlass
from pycutlass import SubstituteTemplate
import pycutlass.library as library
tma_alignment_bytes = 16
cp_async_min_alignment_bytes = 4
class RowColMajorToGMMAMajor:
@staticmethod
def A(layout, element):
"""
Converts operand A's layout from row/column major format into CuTe's GMMA major format
:param layout: layout of the A operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:param element: data type of the A operand
:return: C++ CuTe GMMA major format
:rtype: cute.GMMAMajor
"""
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
if layout == cutlass.ColumnMajor and not type_requires_k_major:
return cute.GMMAMajor.MN
else:
return cute.GMMAMajor.K
@staticmethod
def B(layout, element):
"""
Converts operand B's layout from row/column major format into CuTe's GMMA major format
:param layout: layout of the B operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:param element: data type of the B operand
:return: C++ CuTe GMMA major format
:rtype: cute.GMMAMajor
"""
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
if layout == cutlass.RowMajor and not type_requires_k_major:
return cute.GMMAMajor.MN
else:
return cute.GMMAMajor.K
def cluster_shape_to_tma(dim):
"""
Returns the TMA copy type for a given cluster dimension
:param dim: a given dimension of a cluster
:type dim: int
:return: C++ TMA copy type
:rtype: str
"""
return 'cute::SM90_TMA_LOAD' if dim == 1 else 'cute::SM90_TMA_LOAD_MULTICAST'
def make_cpasync_gmem_tiled_copy(thread_count, element, alignment, gmma_layout, dim_mn, dim_k):
"""
Returns a `make_tiled_copy` call for a given configuration
:param thread_count: number of threads in the threadblock
:type thread_count: int
:param element: datatype of the operand in question
:param alignment: byte alignment of the operand in question
:type alignment: int
:param gmma_layout: GMMA layout of the operand in question
:type gmma_layout: cute.GMMAMajor
:param dim_mn: extent of the M/N dimension of the tile
:type dim_mn: int
:param dim_k: extent of the reduction dimension of the tile
:type dim_k: int
:return: C++ call to `make_tiled_copy`
:rtype: str
"""
emission_str = """decltype(cute::make_tiled_copy(
cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<cute::uint_byte_t<static_cast<int>(sizeof(${element})) * ${alignment}>>, ${element}>{},
cute::Layout<cute::Shape<_${shape0_x}, _${shape0_y}>,
cute::Stride<_${stride_x}, _${stride_y}>>{},
cute::Layout<cute::Shape<_${shape1_x}, _${shape1_y}>>{}))"""
if gmma_layout == cute.GMMAMajor.K:
threads_major = dim_k // alignment
threads_minor = thread_count // threads_major
values = {
'shape0_x': str(threads_minor),
'shape0_y': str(threads_major),
'stride_x': str(threads_major),
'stride_y': '1',
'shape1_x': '1',
'shape1_y': str(alignment)
}
elif gmma_layout == cute.GMMAMajor.MN:
threads_major = dim_mn // alignment
threads_minor = thread_count // threads_major
values = {
'shape0_x': str(threads_major),
'shape0_y': str(threads_minor),
'stride_x': '1',
'stride_y': str(threads_major),
'shape1_x': str(alignment),
'shape1_y': '1'
}
else:
raise Exception('Unexpected GMMA layout {}'.format(gmma_layout))
# Add common values
values['element'] = library.DataTypeTag[element]
values['alignment'] = str(alignment)
return SubstituteTemplate(emission_str, values)
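The thread-partition arithmetic above is easiest to see with concrete numbers; a standalone sketch (values are illustrative, not taken from any particular kernel):

    # K-major partition: threads advance fastest along the contiguous K dimension.
    thread_count, alignment, dim_k = 128, 4, 64
    threads_major = dim_k // alignment             # 16 threads span K, each loading `alignment` elements
    threads_minor = thread_count // threads_major  # 8 threads span the M/N extent
    assert threads_major * threads_minor == thread_count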
def max_stages(op, arch):
"""
Returns the maximum number of pipeline stages that can be used for an operation.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.GemmOperation
:param arch: compute capability of the device on which the operation will be run
:type arch: int
:return: maximum number of pipeline stages that can be used for an operation
:rtype: int
"""
smem_per_stage = library.CalculateSmemUsagePerStage(op)
smem_capacity = library.SharedMemPerCC[arch]
return int(smem_capacity // smem_per_stage)
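Concretely, the stage count is plain integer division of the shared-memory budget by per-stage usage; a sketch with assumed numbers:

    smem_capacity = 228 * 1024    # assumed SM90 shared-memory budget in bytes
    smem_per_stage = 24 * 1024    # hypothetical bytes consumed by one pipeline stage
    print(smem_capacity // smem_per_stage)  # -> 9 stages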
class LayoutToStride:
_variable_first = 'cute::Stride<int64_t, cute::Int<1>, int64_t>'
_variable_last = 'cute::Stride<cute::Int<1>, int64_t, int64_t>'
@staticmethod
def A(layout):
"""
Returns the CuTe stride type corresponding to the layout of operand A
:param layout: layout of the A operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:return: C++ declaration of CuTe stride
:rtype: str
"""
if layout == cutlass.RowMajor:
return LayoutToStride._variable_first
elif layout == cutlass.ColumnMajor:
return LayoutToStride._variable_last
else:
raise Exception('Unsupported layout {}'.format(layout))
@staticmethod
def B(layout):
"""
Returns the CuTe stride type corresponding to the layout of operand B
:param layout: layout of the B operand
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
:return: C++ declaration of CuTe stride
:rtype: str
"""
if layout == cutlass.RowMajor:
return LayoutToStride._variable_last
elif layout == cutlass.ColumnMajor:
return LayoutToStride._variable_first
else:
raise Exception('Unsupported layout {}'.format(layout))
EMISSION_STR = """
using TileShape_MNK = cute::Shape<_${threadblock_shape_m}, _${threadblock_shape_n}, _${threadblock_shape_k}>;
using ClusterShape_MNK = cute::Shape<_${cluster_shape_m}, _${cluster_shape_n}, _${cluster_shape_k}>;
using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
${internal_element_A}, ${internal_element_B}, ${element_accumulator}, TileShape_MNK, ${gmma_layout_A}, ${gmma_layout_B}>()));
using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<${gmma_layout_A}, ${internal_element_A}, _${threadblock_shape_m}, _${threadblock_shape_k}>());
using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<${gmma_layout_B}, ${internal_element_B}, _${threadblock_shape_n}, _${threadblock_shape_k}>());
using CollectiveOp = typename cutlass::gemm::collective::CollectiveMma<
${mainloop_type}<${stage_count}, ClusterShape_MNK${kernel_schedule}>,
TileShape_MNK,
${element_A},
${stride_A},
${element_B},
${stride_B},
TiledMma,
${gmem_tiled_copy_A},
SmemLayoutAtomA,
void, // GMMA_SS does not need an SmemCopyAtom
${transform_A},
${gmem_tiled_copy_B},
SmemLayoutAtomB,
void, // GMMA_SS does not need an SmemCopyAtom
${transform_B}
>;
"""
def internal_element(element):
"""
Returns the data type internally used for `element`.
:param element: data type
:return: data type used internally
"""
return cutlass.tfloat32 if element == cutlass.float32 else element
def common_values(op, stage_count, transform_A, transform_B):
"""
Returns a dictionary containing common values to be substituted in the emission of the
collective operation declaration. Values specific to a particular collective operation
should be added to these.
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:param stage_count: number of pipeline stages to use in the operation
:type stage_count: int
:param transform_A: transformation to perform on the A operand
:type transform_A: str
:param transform_B: transformation to perform on the B operand
:type transform_B: str
:return: dictionary containing values to substitute in emission string
:rtype: dict
"""
internal_element_a = internal_element(op.A.element)
internal_element_b = internal_element(op.B.element)
return {
'threadblock_shape_m': str(op.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(op.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(op.tile_description.threadblock_shape[2]),
'cluster_shape_m': str(op.tile_description.cluster_shape[0]),
'cluster_shape_n': str(op.tile_description.cluster_shape[1]),
'cluster_shape_k': str(op.tile_description.cluster_shape[2]),
'element_A': library.DataTypeTag[op.A.element],
'element_B': library.DataTypeTag[op.B.element],
'internal_element_A': library.DataTypeTag[internal_element_a],
'internal_element_B': library.DataTypeTag[internal_element_b],
'element_accumulator': library.DataTypeTag[op.accumulator_type()],
'gmma_layout_A': library.CuTeLayoutTag[RowColMajorToGMMAMajor.A(op.A.layout, internal_element_a)],
'gmma_layout_B': library.CuTeLayoutTag[RowColMajorToGMMAMajor.B(op.B.layout, internal_element_b)],
'stride_A': LayoutToStride.A(op.A.layout),
'stride_B': LayoutToStride.B(op.B.layout),
'stage_count': str(stage_count),
'transform_A': transform_A,
'transform_B': transform_B
}
def build_gmma_tma(op):
"""
Builds a collective operation declaration targeting TMA GMMA kernels
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_tma_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % tma_alignment_bytes == 0
B_tma_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % tma_alignment_bytes == 0
if not A_tma_aligned or not B_tma_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use TMA'.format(tma_alignment_bytes))
max_stage_count = max_stages(op, arch=90)
if op.tile_description.stages is None:
op.tile_description.stages = max_stage_count
elif op.tile_description.stages > max_stage_count:
raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')
kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecialized'
if op.tile_description.persistent:
kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecializedPersistent'
transform_A = 'cute::identity'
transform_B = 'cute::identity'
values = common_values(op, op.tile_description.stages, transform_A, transform_B)
specific_values = {
'mainloop_type': 'cutlass::gemm::MainloopSm90TmaGmmaWarpSpecialized',
'kernel_schedule': ', ' + kernel_schedule,
'gmem_tiled_copy_A': cluster_shape_to_tma(op.tile_description.cluster_shape[1]),
'gmem_tiled_copy_B': cluster_shape_to_tma(op.tile_description.cluster_shape[0])
}
values.update(specific_values)
return SubstituteTemplate(EMISSION_STR, values)
def build_gmma_cpasync(op):
"""
Builds a collective operation declaration targeting cp.async GMMA kernels
:param op: GEMM operation for which to build a collective operation
:type op: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_cp_async_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % cp_async_min_alignment_bytes == 0
B_cp_async_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % cp_async_min_alignment_bytes == 0
if not A_cp_async_aligned or not B_cp_async_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use cp.async'.format(cp_async_min_alignment_bytes))
max_stage_count = max_stages(op, arch=90)
if op.tile_description.stages is None:
op.tile_description.stages = max_stage_count
elif op.tile_description.stages > max_stage_count:
raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')
transform_A = 'cute::identity'
transform_B = 'cute::identity'
thread_count = 128
cpasync_copy_A = make_cpasync_gmem_tiled_copy(thread_count, op.A.element, op.A.alignment, RowColMajorToGMMAMajor.A(op.A.layout, op.A.element),
op.tile_description.threadblock_shape[0], op.tile_description.threadblock_shape[2])
cpasync_copy_B = make_cpasync_gmem_tiled_copy(thread_count, op.B.element, op.B.alignment, RowColMajorToGMMAMajor.B(op.B.layout, op.B.element),
op.tile_description.threadblock_shape[1], op.tile_description.threadblock_shape[2])
values = common_values(op, op.tile_description.stages, transform_A, transform_B)
specific_values = {
'mainloop_type': 'cutlass::gemm::MainloopSm90CpAsyncGmma',
'kernel_schedule': '',
'gmem_tiled_copy_A': cpasync_copy_A,
'gmem_tiled_copy_B': cpasync_copy_B
}
values.update(specific_values)
return SubstituteTemplate(EMISSION_STR, values)
def build(operation):
"""
Builds a collective operation declaration targeting cp.async or TMA for GMMA kernels
:param operation: GEMM operation for which to build a collective operation
:type operation: pycutlass.GemmOperation
:return: string containing the C++ declaration of collective operation
:rtype: str
"""
A_tma_aligned = (library.DataTypeSizeBytes[operation.A.element] * operation.A.alignment) % tma_alignment_bytes == 0
B_tma_aligned = (library.DataTypeSizeBytes[operation.B.element] * operation.B.alignment) % tma_alignment_bytes == 0
tma_correct_size = (library.DataTypeSizeBytes[operation.A.element] == 2 and library.DataTypeSizeBytes[operation.B.element] == 2)
tma_correct_layout = (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor)
if A_tma_aligned and B_tma_aligned and (tma_correct_size or tma_correct_layout):
return build_gmma_tma(operation)
else:
return build_gmma_cpasync(operation)
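Restated with stub values, the dispatch predicate above reduces to a byte-alignment check (a sketch, not library API):

    # TMA requires 16-byte-aligned operands; fp16 with 8-element alignment qualifies.
    element_size, alignment, tma_alignment_bytes = 2, 8, 16
    a_ok = b_ok = (element_size * alignment) % tma_alignment_bytes == 0
    print("TMA" if a_ok and b_ok else "cp.async")  # -> TMA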

View File

@ -1,279 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import ctypes
from pycutlass.library import *
class GemmCoord_(ctypes.Structure):
_fields_ = [
("m", ctypes.c_int),
("n", ctypes.c_int),
("k", ctypes.c_int)
]
def __init__(self, gemm_coord) -> None:
for field_name, _ in self._fields_:
setattr(self, field_name, getattr(gemm_coord, field_name)())
class GemmCoordBatched_(ctypes.Structure):
"""
Wrapper around a GemmCoord that also contains batch count. This is used for encoding
batched GEMM inputs to CUTLASS 3 GEMMs.
"""
_fields_ = [
("m", ctypes.c_int),
("n", ctypes.c_int),
("k", ctypes.c_int),
("batch_count", ctypes.c_int)
]
def __init__(self, gemm_coord, batch_count) -> None:
for field_name, _ in self._fields_[:-1]:
setattr(self, field_name, getattr(gemm_coord, field_name)())
setattr(self, "batch_count", batch_count)
class MatrixCoord_(ctypes.Structure):
_fields_ = [
("row", ctypes.c_int),
("column", ctypes.c_int)
]
class dim3_(ctypes.Structure):
_fields_ = [
("x", ctypes.c_int),
("y", ctypes.c_int),
("z", ctypes.c_int)
]
class StrideBatched_(ctypes.Structure):
"""
CUTLASS 3.0 strides for operands contain one static dimension and two variable dimensions. The
variable dimensions represent the stride along the non-unit-stride dimension of the row/column-major
layout and the batch stride. This structure encodes the two variable dimensions.
"""
_fields_ = [
("major_stride", ctypes.c_int64),
("batch_stride", ctypes.c_int64)
]
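As a standalone check of how these ctypes structures mirror their C counterparts (a local copy for illustration; the field values are made up):

    import ctypes

    class StrideBatched(ctypes.Structure):
        _fields_ = [("major_stride", ctypes.c_int64),
                    ("batch_stride", ctypes.c_int64)]

    s = StrideBatched(major_stride=1024, batch_stride=1024 * 512)
    assert ctypes.sizeof(s) == 16  # two packed int64 fields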
# float16 has no native ctypes equivalent, so its bits are carried in a c_uint16.
dtype2ctype = {
cutlass.float16: ctypes.c_uint16,
cutlass.float32: ctypes.c_float,
cutlass.float64: ctypes.c_double,
cutlass.int32: ctypes.c_int32
}
def get_gemm_arguments_3x(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GemmArguments(ctypes.Structure):
_fields_ = [
("mode", ctypes.c_int),
("problem_size", GemmCoordBatched_),
("ptr_A", ctypes.c_void_p),
("stride_A", StrideBatched_),
("ptr_B", ctypes.c_void_p),
("stride_B", StrideBatched_),
("ptr_C", ctypes.c_void_p),
("stride_C", StrideBatched_),
("ptr_D", ctypes.c_void_p),
("stride_D", StrideBatched_),
("epilogue", _EpilogueOutputOpParams),
]
return _GemmArguments, _EpilogueOutputOpParams
def get_gemm_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GemmArguments(ctypes.Structure):
_fields_ = [
# Arguments from UniversalArgumentsBase
("mode", ctypes.c_int),
("problem_size", GemmCoord_),
("batch_count", ctypes.c_int),
("batch_stride_D", ctypes.c_longlong),
# Remaining arguments
("epilogue", _EpilogueOutputOpParams),
("ptr_A", ctypes.c_void_p),
("ptr_B", ctypes.c_void_p),
("ptr_C", ctypes.c_void_p),
("ptr_D", ctypes.c_void_p),
("batch_stride_A", ctypes.c_longlong),
("batch_stride_B", ctypes.c_longlong),
("batch_stride_C", ctypes.c_longlong),
("stride_a", ctypes.c_longlong),
("stride_b", ctypes.c_longlong),
("stride_c", ctypes.c_longlong),
("stride_d", ctypes.c_longlong),
("lda", ctypes.c_longlong),
("ldb", ctypes.c_longlong),
("ldc", ctypes.c_longlong),
("ldd", ctypes.c_longlong),
("ptr_gather_A_indices", ctypes.c_void_p),
("ptr_gather_B_indices", ctypes.c_void_p),
("ptr_scatter_D_indices", ctypes.c_void_p)
]
return _GemmArguments, _EpilogueOutputOpParams
###########################################################################################
# GEMM Grouped
###########################################################################################
def get_gemm_grouped_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _GEMMGroupedArguments(ctypes.Structure):
_fields_ = [
("problem_sizes", ctypes.c_void_p),
("problem_count", ctypes.c_int),
("threadblock_count", ctypes.c_int),
("output_op", _EpilogueOutputOpParams),
("ptr_A", ctypes.c_void_p),
("ptr_B", ctypes.c_void_p),
("ptr_C", ctypes.c_void_p),
("ptr_D", ctypes.c_void_p),
("lda", ctypes.c_void_p),
("ldb", ctypes.c_void_p),
("ldc", ctypes.c_void_p),
("ldd", ctypes.c_void_p),
("host_problem_sizes", ctypes.c_void_p)
]
return _GEMMGroupedArguments, _EpilogueOutputOpParams
############################################################################################
# Convolution2D
############################################################################################
class Conv2DProblemSize(ctypes.Structure):
_fields_ = [
("N", ctypes.c_int),
("H", ctypes.c_int),
("W", ctypes.c_int),
("C", ctypes.c_int),
("P", ctypes.c_int),
("Q", ctypes.c_int),
("K", ctypes.c_int),
("R", ctypes.c_int),
("S", ctypes.c_int),
("pad_h", ctypes.c_int),
("pad_w", ctypes.c_int),
("stride_h", ctypes.c_int),
("stride_w", ctypes.c_int),
("dilation_h", ctypes.c_int),
("dilation_w", ctypes.c_int),
("mode", ctypes.c_int), # kCrossCorrelation: 0, kConvolution: 1
("split_k_slices", ctypes.c_int),
("groups", ctypes.c_int)
]
def __init__(self, problem_size) -> None:
for field_name, _ in self._fields_:
setattr(self, field_name, getattr(problem_size, field_name))
class Layout4D(ctypes.Structure):
_fields_ = [
("stride", ctypes.c_int * 3)
]
def __init__(self, tensor_ref):
stride = tensor_ref.stride()
setattr(self, "stride", (stride.at(0), stride.at(1), stride.at(2)))
class TensorRef_(ctypes.Structure):
_fields_ = [
("ptr", ctypes.c_void_p),
("layout", Layout4D)
]
def __init__(self, tensor_ref):
setattr(self, "ptr", tensor_ref.data())
setattr(self, "layout", Layout4D(tensor_ref.layout()))
class TensorRef2D_(ctypes.Structure):
_fields_ = [
("ptr", ctypes.c_void_p),
("stride", ctypes.c_int)
]
def get_conv2d_arguments(epilogue_functor):
_EpilogueOutputOpParams = epilogue_functor.epilogue_type
class _Conv2dArguments(ctypes.Structure):
_fields_ = [
("problem_size", Conv2DProblemSize), # 0
("ref_A", TensorRef_), # 72
("ref_B", TensorRef_), # 96
("ref_C", TensorRef_), # 120
("ref_D", TensorRef_), # 144
("output_op", _EpilogueOutputOpParams), # 168
("split_k_mode", ctypes.c_int) # 192
]
return _Conv2dArguments, _EpilogueOutputOpParams
############################################################################################
# Reduction
############################################################################################
def get_reduction_params(epilogue_functor):
_EpilogueOutputParams = epilogue_functor.epilogue_type
class _ReductionParams(ctypes.Structure):
_fields_ = [
("problem_size", MatrixCoord_),
("partitions", ctypes.c_int),
("partition_stride", ctypes.c_longlong),
("workspace", TensorRef2D_),
("destination", TensorRef2D_),
("source", TensorRef2D_),
("output_op", _EpilogueOutputParams)
]
return _ReductionParams, _EpilogueOutputParams

View File

@ -1,460 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
import cutlass
from cuda import cuda
from cuda import nvrtc
import tempfile
import os
import ctypes
#
import json
import sqlite3
IncludeTemplate = r'''#include "${include}"
'''
#
class CompilationOptions:
'''
Compilation options for the NVRTC and NVCC backends: compiler flags, target architecture, and include paths.
'''
#
    def __init__(self, flags, arch, include_paths=None):
        self.includes = []
        # Avoid a shared mutable default argument.
        self.include_paths = include_paths if include_paths is not None else []
self.flags = flags
self.arch = arch
def get_str(self):
options = ""
for flag in self.flags:
options += " " + flag
for incl in self.include_paths:
options += ' --include-path=%s' % incl
arch_flag = " -arch=sm_%d" % self.arch
if self.arch == 90:
arch_flag += 'a'
options += arch_flag
return options
#
def get(self):
options = []
for flag in self.flags:
options.append(bytes(str.encode(flag)))
for incl in self.include_paths:
options.append(bytes(str.encode('--include-path=%s' % incl)))
arch_flag = " -arch=sm_%d" % self.arch
if self.arch == 90:
arch_flag += 'a'
options.append(bytes(str.encode(arch_flag)))
return options
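A hedged usage sketch of the class above (assuming it is in scope; note how arch 90 picks up the 'a' suffix required for arch-conditional SM90 features):

    opts = CompilationOptions(['-std=c++17'], 90, include_paths=['/usr/local/cuda/include'])
    print(opts.get_str())
    # ->  -std=c++17 --include-path=/usr/local/cuda/include -arch=sm_90a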
def convertToBinaryData(filename):
with open(filename, 'rb') as file:
blobData = file.read()
return blobData
def CDLLBin(host_binary):
tempfile.tempdir = "./"
temp_so = tempfile.NamedTemporaryFile(
prefix='host_func', suffix='.so', delete=True)
with open(temp_so.name, 'wb') as file:
file.write(host_binary)
host_lib = ctypes.CDLL(temp_so.name)
return host_lib
class ArtifactManager:
"""
Artifact manager
"""
def __init__(self) -> None:
try:
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_create_table_query = """CREATE TABLE compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
cursor.execute(sqlite_create_table_query)
connection.commit()
cursor.close()
        except sqlite3.OperationalError:
            # The table already exists from a previous run; reuse it.
            pass
self.nvcc()
self.compiled_cache_device = cutlass.CompileCache()
self.compiled_cache_host = cutlass.CompileCache()
def nvrtc(self):
self.backend = "nvrtc"
self.default_compile_options = [
'-std=c++17', '-default-device'
]
def nvcc(self):
self.backend = "nvcc"
self.default_compile_options = [
'-std=c++17', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored'
]
def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
hostbin = convertToBinaryData(hostfile)
data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
cursor.execute(sqlite_insert_blob_query, data_tuple)
connection.commit()
cursor.close()
def load_operation(self, op_key, extra_funcs):
connection = sqlite3.connect("./compiled_cache.db")
cursor = connection.cursor()
sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
cursor.execute(sqlite_fetch_blob_query, (op_key, ))
record = cursor.fetchall()
if len(record) == 0:
return False
for row in record:
key, cubin_image, host_binary, operation_name, op_attr = row
op_attr = json.loads(op_attr)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
err, kernel = cuda.cuModuleGetFunction(
module, bytes(str.encode(operation_name)))
self.compiled_cache_device.insert(key, kernel)
compiled_host_fns = {}
host_lib = CDLLBin(host_binary)
func_name = operation_name + '_get_params'
func = getattr(host_lib, func_name)
func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
compiled_host_fns['get_args'] = func
func_name = operation_name + '_shared_memory_size'
func = getattr(host_lib, func_name)
compiled_host_fns['shared_memory_capacity'] = func()
for attr in op_attr:
if isinstance(attr, str):
func_name = operation_name + '_' + attr
func = getattr(host_lib, func_name)
# Set the return type of the function
if attr in extra_funcs and extra_funcs[attr] != None:
func.restype = extra_funcs[attr]
compiled_host_fns[attr] = func
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
"""
Compile a list of kernels and store them in the database
"""
source_buffer_device = ""
source_buffer_host = ""
# 1. include
includes = []
for operation in operation_list:
for incl in operation.emitter.includes:
if incl not in includes:
includes.append(incl)
includes_host = [
"builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
for incl in includes:
source_buffer_device += SubstituteTemplate(
IncludeTemplate, {'include': incl})
for incl in includes_host:
if "/device/" not in incl:
source_buffer_host += SubstituteTemplate(
IncludeTemplate, {'include': incl})
# 2. Operations
for operation in operation_list:
source_buffer_device += operation.emit()
source_buffer_host += operation.emit()
values = {
'operation_name': operation.name(),
'operation_suffix': operation.emitter.operation_suffix
}
source_buffer_device += SubstituteTemplate(
operation.KernelTemplate, values)
source_buffer_host += SubstituteTemplate(
operation.HostTemplate, values)
if self.backend == "nvrtc":
# 3. compile
err, program = nvrtc.nvrtcCreateProgram(
str.encode(source_buffer_device),
bytes(str.encode("module.cu")),
0, [], [])
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
# Compile program
options = compilation_options.get()
err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
error_string = 'NVRTC Error: {}\n'.format(err)
# Get log from compilation
err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
log = b' ' * logSize
err, = nvrtc.nvrtcGetProgramLog(program, log)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
raise RuntimeError(
error_string + log.decode() + source_buffer_device)
# Get data from compilation
err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
cubin_image = b' ' * dataSize
err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
raise RuntimeError('NVRTC Error: {}'.format(err))
else: # with nvcc backend
# emit code
tempfile.tempdir = "./"
temp_cu = tempfile.NamedTemporaryFile(
prefix='kernel', suffix='.cu', delete=True)
temp_cubin = tempfile.NamedTemporaryFile(
prefix='kernel', suffix='.cubin', delete=True)
with open(temp_cu.name, 'w') as file:
file.write(source_buffer_device)
# compile with nvcc
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
values = {
"cuda_install_path": cuda_install_path,
"options": compilation_options.get_str(),
"srcfile": temp_cu.name,
"tarfile": temp_cubin.name
}
cmd = SubstituteTemplate(cmd_template, values)
os.system(cmd)
# load the cubin image
with open(temp_cubin.name, 'rb') as file:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
cmd_template = "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" % source_buffer_host
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": cuda_install_path,
"options": compilation_options.get_str()
})
else:
options = compilation_options.get()
cmd = "echo '%s'|g++ -x c++ -fpermissive -w -fPIC" % source_buffer_host
filtered_opts = ['-default-device', '-Xcicc', '-Xllc', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored']
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and '-arch=sm_' not in opt:
if '--include-path=' in opt:
cmd += " " + opt.replace('--include-path=', '-I')
else:
cmd += " " + opt
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
prefix='host_func', suffix='.so', delete=True)
cmd += ' - -shared -o %s -lcudart -lcuda' % temp.name
os.system(cmd)
host_lib = ctypes.CDLL(temp.name)
return cubin_image, host_lib, temp
def add_module(self, operations, compile_options=None):
"""
Insert a new compiled device module
"""
if compile_options is None:
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
include_paths = [
cuda_install_path + '/include',
cutlass_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/tools/library/scripts/pycutlass/src/cpp/include'
]
if pycutlass.DEVICE_CC is not None:
arch = pycutlass.DEVICE_CC
else:
# Find the maximum arch tag among the provided operations and compile for that target.
# Since we are compiling to .cubin files, only one architecture may be specified.
arch = max([op.arch for op in operations])
compile_options = CompilationOptions(
self.default_compile_options, arch, include_paths)
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
            # step 2: check whether the operation is already in one of the caches
compiled_kernel = self.compiled_cache_device.at(key)
if compiled_kernel is None:
hit = self.load_operation(key, getattr(operation.rt_module, 'extra_funcs', {}))
if hit:
compiled_kernel = self.compiled_cache_device.at(key)
assert compiled_kernel is not None
if compiled_kernel is not None:
operation.rt_module.kernel = compiled_kernel
compiled_host_fns = self.compiled_cache_host.at(key)
assert compiled_host_fns is not None
                for fn_name in compiled_host_fns.keys():
                    setattr(operation.rt_module, fn_name, compiled_host_fns[fn_name])
operation.rt_module.initialize()
else:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, pycutlass.gemm_operation.GemmOperationUniversal) and operation.api == pycutlass.library.ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError('CUTLASS 3 kernels currently require NVCC for compilation.')
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
operation_name = []
operation_attr = []
for operation, key in zip(operation_list, operation_key):
# get device kernels
err, operation.kernel = cuda.cuModuleGetFunction(
module,
bytes(str.encode(operation.name()))
)
operation_name.append(operation.name())
self.compiled_cache_device.insert(key, operation.kernel)
# get host functions
compiled_host_fns = {}
op_attr = []
# get param size
func_name = operation.name() + '_get_param_size'
func = getattr(host_lib, func_name)
param_size = func()
func_name = operation.name() + '_get_params'
func = getattr(host_lib, func_name)
func.argtype = operation.argtype
func.restype = ctypes.POINTER(ctypes.c_char * param_size)
setattr(operation, 'get_args', func)
compiled_host_fns['get_args'] = func
# set shared memory size
func_name = operation.name() + '_shared_memory_size'
func = getattr(host_lib, func_name)
setattr(operation, 'shared_memory_capacity', func())
compiled_host_fns['shared_memory_capacity'] = func()
# set the maximum dynamic shared size
operation.initialize()
# get extra functions
op_attr.append(param_size)
if hasattr(operation, "extra_funcs"):
for suffix, ret_type in operation.extra_funcs.items():
func_name = operation.name() + '_' + suffix
func = getattr(host_lib, func_name)
if ret_type is not None:
func.restype = ret_type
setattr(operation, suffix, func)
compiled_host_fns[suffix] = func
op_attr.append(suffix)
operation_attr.append(op_attr)
self.compiled_cache_host.insert(key, compiled_host_fns)
            for op_key, op_name, op_attrs in zip(operation_key, operation_name, operation_attr):
                self.insert_operation(
                    op_key, cubin_image, host_file.name, op_name, op_attrs)

View File

@ -1,632 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
from cuda import cuda
from typing import Union
import numpy as np
from pycutlass import *
# @typechecked
class Conv2dArguments(ArgumentBase):
"""
Argument wrapper for Conv2d. It encodes problem information and
user-provided tensors into the kernel's arguments.
:param operation: the Conv2d operation to take the argument
:type operation: :class:`pycutlass.Conv2dOperation`
:param problem_size: the Conv2d problem size
:type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
:param A: tensor A
:type A: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param B: tensor B
:type B: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param C: tensor C
:type C: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param D: tensor D
:type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray
:param split_k_mode: conv2d split K mode, defaults to
cutlass.conv.SplitKMode.Serial
:type split_k_mode: cutlass.conv.SplitKMode, optional
:param output_op: output operator, optional
:type output_op: :class:`pycutlass.LinearCombinationFunctorArguments`
"""
def __init__(self, operation: 'Conv2dOperation',
problem_size: 'cutlass.conv.Conv2dProblemSize',
A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
split_k_mode: 'cutlass.conv.SplitKMode'
= cutlass.conv.SplitKMode.Serial, **kwargs) -> None:
self.operation = operation
#: convolution kind
self.conv_kind: cutlass.conv.Operator = operation.conv_kind
self.layout_A: cutlass.layout = operation.A.layout
self.layout_B: cutlass.layout = operation.B.layout
self.layout_C: cutlass.layout = operation.C.layout
self.element_A = operation.A.element
self.element_B = operation.B.element
self.element_C = operation.C.element
if self.layout_C == cutlass.TensorNC32HW32:
B = self.reorder_tensor_B(B, problem_size)
super().__init__(A, B, C, D, **kwargs)
# preprocessing output ops
if 'output_op' in kwargs.keys() and \
split_k_mode != cutlass.conv.SplitKMode.Parallel:
self.output_op = kwargs['output_op']
else:
self.output_op = self.operation.epilogue_type(1.0, 0.0)
if "split_k_slices" in kwargs.keys():
self.split_k_mode = split_k_mode
self.split_k_slices = kwargs["split_k_slices"]
else:
self.split_k_mode = cutlass.conv.SplitKMode.Serial
self.split_k_slices = 1
#: problem_size
self.problem_size: cutlass.conv.Conv2dProblemSize = problem_size
self.problem_size.split_k_slices = self.split_k_slices
if hasattr(self, "tensor_c_numel"):
c_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
self.conv_kind, problem_size)
if (self.tensor_c_numel == c_coord.at(3) and
self.tensor_c_numel < c_coord.size()):
self.bias = True
#
# initialize the argument
#
self.initialize()
# @typechecked
def reorder_tensor_B(self, tensor_B: 'np.ndarray',
problem_size: 'cutlass.conv.Conv2dProblemSize'):
"""
Reorder tensor_B for interleaved layout
:param tensor_B: input tensor B
:type tensor_B: numpy.ndarray
:param problem_size: Conv2d problem size
:type problem_size: :class:`cutlass.conv.Conv2dProblemSize`
:return: reordered tensor B
:rtype: numpy.ndarray
"""
reordered_tensor_B = np.empty_like(tensor_B)
tensor_ref_B = self.get_tensor_ref(
tensor_B, self.element_B, self.layout_B, problem_size, "b")
reordered_tensor_ref_B = self.get_tensor_ref(
reordered_tensor_B, self.element_B,
self.layout_B, problem_size, "b")
cutlass.conv.host.reorder_convK(
reordered_tensor_ref_B, tensor_ref_B, self.conv_kind, problem_size)
return reordered_tensor_B
def get_tensor_ref(
self, tensor, dtype, tensor_layout, problem_size, operand):
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(
self.conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(
self.conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
self.conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
# Zero-stride trick: when C is a broadcast bias vector, give it a packed
# zero-extent layout so every mode has stride 0
if operand == "c" and getattr(self, "bias", False):
tensor_coord = cutlass.Tensor4DCoord(0, 0, 0, 0)
layout = tensor_layout.packed(tensor_coord)
return TensorRef(tensor, dtype, layout).tensor_ref
def get_arguments(self, semaphore):
ref_A = TensorRef_(self.get_tensor_ref(
self.ptr_A, self.element_A, self.layout_A, self.problem_size, "a"))
ref_B = TensorRef_(self.get_tensor_ref(
self.ptr_B, self.element_B, self.layout_B, self.problem_size, "b"))
ref_C = TensorRef_(self.get_tensor_ref(
self.ptr_C, self.element_C, self.layout_C, self.problem_size, "c"))
ref_D = TensorRef_(self.get_tensor_ref(
self.ptr_D, self.element_C, self.layout_C, self.problem_size, "d"))
self.c_arguments = self.operation.argument_type(
Conv2DProblemSize(self.problem_size),
ref_A, ref_B, ref_C, ref_D, self.output_op, self.split_k_mode
)
self.semaphore = semaphore
def initialize(self):
"""
Initialize the kernel arguments handling following stuffs
1. get kernel launch configuration including grid, cta size,
and dynamic shared memory capacity
2. allocate and initialize device workspace
3. get kernel params as bytearray for NVRTC input
"""
# get launch configuration
self.launch_config = self.operation.rt_module.plan(self)
# allocate and initialize device workspace
device_workspace_size = \
self.operation.rt_module.get_device_workspace_size(self)
if device_workspace_size > 0:
self.workspace_buffer = device_mem_alloc(device_workspace_size)
workspace_ptr = self.workspace_buffer.ptr
err, = cuda.cuMemsetD32(
workspace_ptr, 0, device_workspace_size // 4)
else:
workspace_ptr = None
# get kernel params as bytearray
semaphore = 0
if workspace_ptr is not None and \
self.split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.ptr_D = workspace_ptr
elif workspace_ptr is not None and \
self.split_k_mode == cutlass.conv.SplitKMode.Serial:
semaphore = workspace_ptr
self.get_arguments(semaphore)
params_ = self.operation.rt_module.get_args(ctypes.byref(
self.c_arguments), ctypes.c_void_p(int(self.semaphore)))
self.host_workspace = bytearray(params_.contents)
self.device_workspace = None
def sync(self):
"""
Synchronize the arguments. If the input tensors reside in host memory,
copy the results back from device to host.
"""
return super().sync()
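# Illustrative sketch (assumption, not part of the original file): a typical
# way Conv2dArguments is constructed for a serial split-K fprop launch.
# `operation`, `problem_size`, and the tensors are assumed to exist already.
def _example_make_conv2d_arguments(operation, problem_size, A, B, C, D):
    return Conv2dArguments(
        operation=operation, problem_size=problem_size,
        A=A, B=B, C=C, D=D,
        output_op=operation.epilogue_type(1.0, 0.0),
        split_k_mode=cutlass.conv.SplitKMode.Serial,
        split_k_slices=1)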
# @typechecked
class Conv2dRT(ExecutableOperation):
"""
Conv2dRT manages the CUTLASS runtime components
"""
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Arguments* arguments, int *semaphore=nullptr){
typename ${operation_name}${operation_suffix}::Params* params;
params = new ${operation_name}${operation_suffix}::Params(*arguments, semaphore);
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
delete params;  // the byte copy above is all the caller needs
return output;
}
}
'''
def __init__(self, operation: 'Conv2dOperation'):
super().__init__(operation)
self.argument_type, self.epilogue_type = get_conv2d_arguments(operation.epilogue_functor)
self.argtype = [ctypes.POINTER(self.argument_type), ctypes.c_void_p]
self.conv_kind = operation.conv_kind
self.operation: Conv2dOperation = operation
self.emitter = EmitConv2dInstance('_type')
self.threads: int = operation.tile_description.num_threads
self.swizzle_functor = operation.swizzling_functor
def emit(self):
return self.emitter.emit(self.operation)
# @typechecked
def get_device_workspace_size(self, arguments: Conv2dArguments):
workspace_bytes = 0
launch_config = arguments.launch_config
self.conv_kind = self.operation.conv_kind
if arguments.split_k_mode == cutlass.conv.SplitKMode.Parallel:
problem_size = arguments.problem_size
workspace_bytes = DataTypeSize[self.operation.C.element] \
* launch_config.grid[2] * cutlass.conv.implicit_gemm_tensor_c_size(
self.conv_kind, problem_size
) // 8
elif arguments.split_k_mode == cutlass.conv.SplitKMode.Serial and \
arguments.split_k_slices > 1:
workspace_bytes = launch_config.grid[0] * launch_config.grid[1] * 4
return workspace_bytes
# @typechecked
def plan(self, arguments: Conv2dArguments):
tile_size = cutlass.gemm.GemmCoord(
self.operation.tile_description.threadblock_shape[0],
self.operation.tile_description.threadblock_shape[1],
self.operation.tile_description.threadblock_shape[2]
)
grid = self.swizzle_functor.get_grid_shape(
self.swizzle_functor.get_tiled_shape(
self.conv_kind, arguments.problem_size,
tile_size, arguments.split_k_slices
)
)
return LaunchConfiguration(
[grid.x, grid.y, grid.z], [self.threads, 1, 1],
self.shared_memory_capacity)
def initialize(self):
err, = cuda.cuFuncSetAttribute(
self.kernel,
attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
value=self.shared_memory_capacity)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
#
class Conv2dOperation:
"""
CUTLASS Conv2d operation description.
:param conv_kind: convolution operator
:type conv_kind: :class:`cutlass.conv.Operator`
:param iterator_algorithm: Selects among several implementation
variants trading off performance with simplicity
:type iterator_algorithm: :class:`cutlass.conv.IteratorAlgorithm`
:param arch: GPU compute capability (sm_xx)
:type arch: int
:param tile_description: tile description
:type tile_description: :class:`pycutlass.TileDescription`
:param A: tensor A description
:type A: :class:`pycutlass.TensorDescription`
:param B: tensor B description
:type B: :class:`pycutlass.TensorDescription`
:param C: tensor C description
:type C: :class:`pycutlass.TensorDescription`
:param stride_support: distinguish among partial specializations that
accelerate certain problems where the convolution stride is unity
:type stride_support: :class:`cutlass.conv.StrideSupport`
:param epilogue_functor: convolution epilogue functor
:type epilogue_functor: :class:`EpilogueFunctor`
:param swizzling_functor: threadblock swizzling functor
"""
#
def __init__(self,
conv_kind: cutlass.conv.Operator,
iterator_algorithm: cutlass.conv.IteratorAlgorithm,
arch: int, tile_description: TileDescription,
A: TensorDescription, B: TensorDescription, C: TensorDescription,
stride_support, epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1):
self.operation_kind: OperationKind = OperationKind.Conv2d
self.arch: int = arch
self.tile_description: TileDescription = tile_description
self.conv_kind = conv_kind
self.A: TensorDescription = A
self.B: TensorDescription = B
self.C: TensorDescription = C
self.epilogue_functor = epilogue_functor
self.iterator_algorithm = iterator_algorithm
self.stride_support = stride_support
self.swizzling_functor = swizzling_functor()
self.rt_module: Conv2dRT = Conv2dRT(self)
self.argument_type = self.rt_module.argument_type
self.epilogue_type = self.rt_module.epilogue_type
def run(self, arguments: Conv2dArguments) -> cuda.CUresult:
"""
Launch the cuda kernel with input arguments
:param arguments: conv2d arguments
:type arguments: :class:`pycutlass.Conv2dArguments`
"""
# launch the kernel
err = self.rt_module.run(
arguments.host_workspace,
arguments.device_workspace,
arguments.launch_config)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('CUDA Error %s' % str(err))
return err
#
# Get function name
#
def procedural_name(self):
''' The full procedural name; for Conv2d it is identical to the configuration name. '''
return self.configuration_name()
#
def configuration_name(self):
''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
threadblock = "%dx%d_%dx%d" % (
self.tile_description.threadblock_shape[0],
self.tile_description.threadblock_shape[1],
self.tile_description.threadblock_shape[2],
self.tile_description.stages
)
if self.stride_support == StrideSupport.Unity:
configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_align${alignment}"
else:
configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}"
return SubstituteTemplate(
configuration_name,
{
'arch': str(self.arch),
'opcode_class': opcode_class_name,
'extended_name': self.extended_name(),
'threadblock': threadblock,
'layout': self.layout_name(),
'alignment': "%d" % self.A.alignment,
}
)
#
def extended_name(self):
''' Append data types if they differ from compute type. '''
if self.C.element != self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${element_c}_${core_name}_${element_a}"
elif self.C.element == self.tile_description.math_instruction.element_accumulator and \
self.A.element != self.tile_description.math_instruction.element_accumulator:
extended_name = "${core_name}_${element_a}"
else:
extended_name = "${core_name}"
extended_name = SubstituteTemplate(extended_name, {
'element_a': DataTypeNames[self.A.element],
'element_c': DataTypeNames[self.C.element],
'core_name': self.core_name()
})
return extended_name
#
def layout_name(self):
return "%s" % (ShortLayoutTypeNames[self.A.layout])
#
def core_name(self):
''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
intermediate_type = ''
if self.tile_description.math_instruction.opcode_class == cutlass.OpClass.TensorOp:
inst_shape = "%dx%dx%d" % tuple(
self.tile_description.math_instruction.instruction_shape)
if self.tile_description.math_instruction.element_a != self.A.element and \
self.tile_description.math_instruction.element_a != self.accumulator_type():
intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
else:
inst_shape = ''
return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()],
inst_shape, intermediate_type, ConvKindNames[self.conv_kind], IteratorAlgorithmNames[self.iterator_algorithm])
#
def is_complex(self):
complex_operators = [
MathOperation.multiply_add_complex,
MathOperation.multiply_add_complex_gaussian
]
return self.tile_description.math_instruction.math_operation in complex_operators
#
def accumulator_type(self):
accum = self.tile_description.math_instruction.element_accumulator
if self.is_complex():
return get_complex_from_real(accum)
return accum
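# Illustrative sketch (assumption, not from the original file): launching a
# Conv2dOperation once its Conv2dArguments have been initialized.
def _example_run_conv2d(operation, arguments):
    operation.run(arguments)  # raises RuntimeError on launch failure
    arguments.sync()          # copy results back to the host tensors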
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################
class EmitConv2dInstance:
def __init__(self, operation_suffix=''):
self.operation_suffix = operation_suffix
self.includes = [
"cutlass/cutlass.h",
"cutlass/conv/kernel/default_conv2d_fprop.h",
"cutlass/conv/kernel/default_conv2d_dgrad.h",
"cutlass/conv/kernel/default_conv2d_wgrad.h"
]
self.template = """
// Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
using ${operation_name}_base =
typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
${element_a},
${layout_a},
${element_b},
${layout_b},
${element_c},
${layout_c},
${element_accumulator},
${opcode_class},
${arch},
cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
${epilogue_functor},
${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
${stages},
${math_operator},
${iterator_algorithm},
${stride_support},
${align_a},
${align_b}
>::Kernel;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""
def emit(self, operation):
warp_shape = [int(operation.tile_description.threadblock_shape[idx] /
operation.tile_description.warp_count[idx]) for idx in range(3)]
epilogue_vector_length = int(min(
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
values = {
'operation_name': operation.procedural_name(),
'operation_suffix': self.operation_suffix,
'conv_kind': ConvKindTag[operation.conv_kind],
'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
'element_a': DataTypeTag[operation.A.element],
'layout_a': LayoutTag[operation.A.layout],
'element_b': DataTypeTag[operation.B.element],
'layout_b': LayoutTag[operation.B.layout],
'element_c': DataTypeTag[operation.C.element],
'layout_c': LayoutTag[operation.C.layout],
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
'warp_shape_m': str(warp_shape[0]),
'warp_shape_n': str(warp_shape[1]),
'warp_shape_k': str(warp_shape[2]),
'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
'epilogue_vector_length': str(epilogue_vector_length),
'epilogue_functor': operation.epilogue_functor.emit(),
'swizzling_functor': operation.swizzling_functor.tag(),
'stages': str(operation.tile_description.stages),
'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
'stride_support': StrideSupportTag[operation.stride_support],
'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else
MathOperationTag[operation.tile_description.math_instruction.math_operation],
'align_a': str(operation.A.alignment),
'align_b': str(operation.B.alignment),
}
return SubstituteTemplate(self.template, values)

File diff suppressed because it is too large

View File

@ -1,104 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import numpy as np
from cuda import cuda
from pycutlass.memory_manager import *
from typing import TYPE_CHECKING
try:
import torch
torch_available = True
except ImportError:
torch_available = False
if TYPE_CHECKING:
import torch
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
if TYPE_CHECKING:
import cupy as cp
class NumpyFrontend:
"""
Frontend node for numpy
"""
@staticmethod
def argument(np_tensor: 'np.ndarray', is_output: 'bool') -> cuda.CUdeviceptr:
"""Convert the input numpy tensor to CUDA device pointer
:param np_tensor: input numpy nd array
:param is_output: whether the tensor is output
:return: CUDA device pointer
"""
# copy the data to device
if is_output:
return device_mem_alloc(np_tensor.size * np_tensor.itemsize)
else:
return todevice(np_tensor)
class TorchFrontend:
"""
Frontend node for torch
"""
@staticmethod
def argument(torch_tensor: 'torch.Tensor') -> cuda.CUdeviceptr:
"""Convert the input torch tensor to CUDA device pointer
:param torch_tensor: input torch tensor
:param is_output: whether the tensor is output
:return: CUDA device pointer
"""
# check the device of torch_tensor
if not torch_tensor.is_cuda:
torch_tensor = torch_tensor.to("cuda")
return cuda.CUdeviceptr(torch_tensor.data_ptr())
class CupyFrontend:
"""
Frontend node for cupy
"""
@staticmethod
def argument(cupy_ndarray: 'cp.ndarray'):
return cuda.CUdeviceptr(int(cupy_ndarray.data.ptr))

File diff suppressed because it is too large

View File

@ -1,870 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import re
###################################################################################################
import enum
import cutlass
import cute
# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
try:
from enum import auto as enum_auto
except ImportError:
__cutlass_library_auto_enum = 0
def enum_auto() -> int:
global __cutlass_library_auto_enum
i = __cutlass_library_auto_enum
__cutlass_library_auto_enum += 1
return i
###################################################################################################
#
class GeneratorTarget(enum.Enum):
Library = enum_auto()
#
GeneratorTargetNames = {
GeneratorTarget.Library: 'library',
}
#
###################################################################################################
#
ShortDataTypeNames = {
cutlass.int32: 'i',
cutlass.float16: 'h',
cutlass.float32: 's',
cutlass.float64: 'd',
cutlass.dtype.cf32: 'c',
cutlass.dtype.cf64: 'z',
}
#
DataTypeNames = {
cutlass.dtype.b1: "b1",
cutlass.dtype.u4: "u4",
cutlass.dtype.u8: "u8",
cutlass.dtype.u16: "u16",
cutlass.dtype.u32: "u32",
cutlass.dtype.u64: "u64",
cutlass.dtype.s4: "s4",
cutlass.int8: "s8",
cutlass.dtype.s16: "s16",
cutlass.int32: "s32",
cutlass.dtype.s64: "s64",
cutlass.float16: "f16",
cutlass.bfloat16: "bf16",
cutlass.float32: "f32",
cutlass.tfloat32: "tf32",
cutlass.float64: "f64",
cutlass.dtype.cf16: "cf16",
cutlass.dtype.cbf16: "cbf16",
cutlass.dtype.cf32: "cf32",
cutlass.dtype.ctf32: "ctf32",
cutlass.dtype.cf64: "cf64",
cutlass.dtype.cu4: "cu4",
cutlass.dtype.cu8: "cu8",
cutlass.dtype.cu16: "cu16",
cutlass.dtype.cu32: "cu32",
cutlass.dtype.cu64: "cu64",
cutlass.dtype.cs4: "cs4",
cutlass.dtype.cs8: "cs8",
cutlass.dtype.cs16: "cs16",
cutlass.dtype.cs32: "cs32",
cutlass.dtype.cs64: "cs64",
}
DataTypeTag = {
cutlass.dtype.b1: "cutlass::uint1b_t",
cutlass.dtype.u4: "cutlass::uint4b_t",
cutlass.dtype.u8: "uint8_t",
cutlass.dtype.u16: "uint16_t",
cutlass.dtype.u32: "uint32_t",
cutlass.dtype.u64: "uint64_t",
cutlass.dtype.s4: "cutlass::int4b_t",
cutlass.int8: "int8_t",
cutlass.dtype.s16: "int16_t",
cutlass.int32: "int32_t",
cutlass.dtype.s64: "int64_t",
cutlass.float16: "cutlass::half_t",
cutlass.bfloat16: "cutlass::bfloat16_t",
cutlass.float32: "float",
cutlass.tfloat32: "cutlass::tfloat32_t",
cutlass.float64: "double",
cutlass.dtype.cf16: "cutlass::complex<cutlass::half_t>",
cutlass.dtype.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
cutlass.dtype.cf32: "cutlass::complex<float>",
cutlass.dtype.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
cutlass.dtype.cf64: "cutlass::complex<double>",
cutlass.dtype.cu4: "cutlass::complex<cutlass::uint4b_t>",
cutlass.dtype.cu8: "cutlass::complex<cutlass::uint8_t>",
cutlass.dtype.cu16: "cutlass::complex<cutlass::uint16_t>",
cutlass.dtype.cu32: "cutlass::complex<cutlass::uint32_t>",
cutlass.dtype.cu64: "cutlass::complex<cutlass::uint64_t>",
cutlass.dtype.cs4: "cutlass::complex<cutlass::int4b_t>",
cutlass.dtype.cs8: "cutlass::complex<cutlass::int8_t>",
cutlass.dtype.cs16: "cutlass::complex<cutlass::int16_t>",
cutlass.dtype.cs32: "cutlass::complex<cutlass::int32_t>",
cutlass.dtype.cs64: "cutlass::complex<cutlass::int64_t>",
}
DataTypeSize = {
cutlass.dtype.b1: 1,
cutlass.dtype.u4: 4,
cutlass.dtype.u8: 8,
cutlass.dtype.u16: 16,
cutlass.dtype.u32: 32,
cutlass.dtype.u64: 64,
cutlass.dtype.s4: 4,
cutlass.int8: 8,
cutlass.dtype.s16: 16,
cutlass.int32: 32,
cutlass.dtype.s64: 64,
cutlass.float16: 16,
cutlass.bfloat16: 16,
cutlass.float32: 32,
cutlass.tfloat32: 32,
cutlass.float64: 64,
cutlass.dtype.cf16: 32,
cutlass.dtype.cbf16: 32,
cutlass.dtype.cf32: 64,
cutlass.dtype.ctf32: 32,
cutlass.dtype.cf64: 128,
cutlass.dtype.cu4: 8,
cutlass.dtype.cu8: 16,
cutlass.dtype.cu16: 32,
cutlass.dtype.cu32: 64,
cutlass.dtype.cu64: 128,
cutlass.dtype.cs4: 8,
cutlass.dtype.cs8: 16,
cutlass.dtype.cs16: 32,
cutlass.dtype.cs32: 64,
cutlass.dtype.cs64: 128,
}
class DataTypeSizeBytes:
"""
Static class to mimic the `DataTypeSize` dictionary, but with checks for whether the
data type key is less than a full byte or a non-integer number of bytes.
"""
@staticmethod
def __class_getitem__(datatype):
"""
Returns the number of bytes in size the data type is. Raises an exception if the data type
is either less than a full byte or a non-integer number of bytes in size.
:param datatype: data type to query
:return: number of bytes the data type occupies
:rtype: int
"""
bits = DataTypeSize[datatype]
if bits < 8:
raise Exception('Data type {} is less than one byte in size.'.format(datatype))
elif bits % 8 != 0:
raise Exception('Data type {} is not an integer number of bytes.'.format(datatype))
return bits // 8
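# Usage note (sketch): DataTypeSize reports raw bit widths, while
# DataTypeSizeBytes converts to bytes and rejects sub-byte types, e.g.
#   DataTypeSize[cutlass.float16]       -> 16  (bits)
#   DataTypeSizeBytes[cutlass.float16]  -> 2   (bytes)
#   DataTypeSizeBytes[cutlass.dtype.s4] -> raises Exception (4 bits)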
###################################################################################################
#
class BlasMode(enum.Enum):
symmetric = enum_auto()
hermitian = enum_auto()
#
BlasModeTag = {
BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
}
#
ComplexTransformTag = {
cutlass.complex_transform.none: 'cutlass::ComplexTransform::kNone',
cutlass.complex_transform.conj: 'cutlass::ComplexTransform::kConjugate',
}
#
RealComplexBijection = [
(cutlass.float16, cutlass.dtype.cf16),
(cutlass.float32, cutlass.dtype.cf32),
(cutlass.float64, cutlass.dtype.cf64),
]
#
def is_complex(data_type):
for r, c in RealComplexBijection:
if data_type == c:
return True
return False
#
def get_complex_from_real(real_type):
for r, c in RealComplexBijection:
if real_type == r:
return c
return cutlass.dtype.invalid
#
def get_real_from_complex(complex_type):
for r, c in RealComplexBijection:
if complex_type == c:
return r
return cutlass.dtype.invalid
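# Example (sketch): the bijection maps element types in both directions and
# falls back to cutlass.dtype.invalid for unknown types, e.g.
#   get_complex_from_real(cutlass.float32)    -> cutlass.dtype.cf32
#   get_real_from_complex(cutlass.dtype.cf64) -> cutlass.float64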
#
class ComplexMultiplyOp(enum.Enum):
multiply_add = enum_auto()
gaussian = enum_auto()
###################################################################################################
#
class MathOperation(enum.Enum):
multiply_add = enum_auto()
multiply_add_saturate = enum_auto()
xor_popc = enum_auto()
multiply_add_fast_bf16 = enum_auto()
multiply_add_fast_f16 = enum_auto()
multiply_add_fast_f32 = enum_auto()
multiply_add_complex_fast_f32 = enum_auto()
multiply_add_complex = enum_auto()
multiply_add_complex_gaussian = enum_auto()
#
MathOperationNames = {
MathOperation.multiply_add: 'multiply_add',
MathOperation.multiply_add_saturate: 'multiply_add_saturate',
MathOperation.xor_popc: 'xor_popc',
MathOperation.multiply_add_fast_bf16: 'multiply_add_fast_bf16',
MathOperation.multiply_add_fast_f16: 'multiply_add_fast_f16',
MathOperation.multiply_add_fast_f32: 'multiply_add_fast_f32',
MathOperation.multiply_add_complex_fast_f32: 'multiply_add_complex_fast_f32',
MathOperation.multiply_add_complex: 'multiply_add_complex',
MathOperation.multiply_add_complex_gaussian: 'multiply_add_complex_gaussian',
}
#
MathOperationTag = {
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
MathOperation.multiply_add_fast_f32: 'cutlass::arch::OpMultiplyAddFastF32',
MathOperation.multiply_add_complex_fast_f32: 'cutlass::arch::OpMultiplyAddComplexFastF32',
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
}
###################################################################################################
#
LayoutTag = {
cutlass.ColumnMajor: 'cutlass::layout::ColumnMajor',
cutlass.RowMajor: 'cutlass::layout::RowMajor',
cutlass.layout.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
cutlass.layout.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
cutlass.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
cutlass.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
cutlass.layout.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
cutlass.layout.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
cutlass.TensorNHWC: 'cutlass::layout::TensorNHWC',
cutlass.layout.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
cutlass.layout.TensorNCHW: 'cutlass::layout::TensorNCHW',
cutlass.layout.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
cutlass.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
cutlass.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
cutlass.layout.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
cutlass.layout.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
}
#
TransposedLayout = {
cutlass.ColumnMajor: cutlass.RowMajor,
cutlass.RowMajor: cutlass.ColumnMajor,
cutlass.layout.ColumnMajorInterleaved2: cutlass.layout.RowMajorInterleaved2,
cutlass.layout.RowMajorInterleaved2: cutlass.layout.ColumnMajorInterleaved2,
cutlass.ColumnMajorInterleaved32: cutlass.RowMajorInterleaved32,
cutlass.RowMajorInterleaved32: cutlass.ColumnMajorInterleaved32,
cutlass.layout.ColumnMajorInterleaved64: cutlass.layout.RowMajorInterleaved64,
cutlass.layout.RowMajorInterleaved64: cutlass.layout.ColumnMajorInterleaved64,
cutlass.TensorNHWC: cutlass.TensorNHWC
}
#
ShortLayoutTypeNames = {
cutlass.ColumnMajor: 'n',
cutlass.layout.ColumnMajorInterleaved2: 'n2',
cutlass.ColumnMajorInterleaved32: 'n32',
cutlass.layout.ColumnMajorInterleaved64: 'n64',
cutlass.RowMajor: 't',
cutlass.layout.RowMajorInterleaved2: 't2',
cutlass.RowMajorInterleaved32: 't32',
cutlass.layout.RowMajorInterleaved64: 't64',
cutlass.TensorNHWC: 'nhwc',
cutlass.layout.TensorNDHWC: 'ndhwc',
cutlass.layout.TensorNCHW: 'nchw',
cutlass.layout.TensorNGHWC: 'nghwc',
cutlass.TensorNC32HW32: 'nc32hw32',
cutlass.layout.TensorNC64HW64: 'nc64hw64',
cutlass.TensorC32RSK32: 'c32rsk32',
cutlass.layout.TensorC64RSK64: 'c64rsk64'
}
#
ShortComplexLayoutNames = {
(cutlass.ColumnMajor, cutlass.complex_transform.none): 'n',
(cutlass.ColumnMajor, cutlass.complex_transform.conj): 'c',
(cutlass.RowMajor, cutlass.complex_transform.none): 't',
(cutlass.RowMajor, cutlass.complex_transform.conj): 'h'
}
#
CuTeLayoutTag = {
cute.GMMAMajor.K: 'cute::GMMA::Major::K',
cute.GMMAMajor.MN: 'cute::GMMA::Major::MN'
}
###################################################################################################
#
class SideMode(enum.Enum):
Left = enum_auto()
Right = enum_auto()
#
SideModeTag = {
SideMode.Left: 'cutlass::SideMode::kLeft',
SideMode.Right: 'cutlass::SideMode::kRight'
}
#
ShortSideModeNames = {
SideMode.Left: 'ls',
SideMode.Right: 'rs'
}
###################################################################################################
#
class FillMode(enum.Enum):
Lower = enum_auto()
Upper = enum_auto()
#
FillModeTag = {
FillMode.Lower: 'cutlass::FillMode::kLower',
FillMode.Upper: 'cutlass::FillMode::kUpper'
}
#
ShortFillModeNames = {
FillMode.Lower: 'l',
FillMode.Upper: 'u'
}
###################################################################################################
#
class DiagType(enum.Enum):
NonUnit = enum_auto()
Unit = enum_auto()
#
DiagTypeTag = {
DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
DiagType.Unit: 'cutlass::DiagType::kUnit'
}
#
ShortDiagTypeNames = {
DiagType.NonUnit: 'nu',
DiagType.Unit: 'un'
}
###################################################################################################
OpcodeClassNames = {
cutlass.OpClass.Simt: 'simt',
cutlass.OpClass.TensorOp: 'tensorop',
cutlass.OpClass.WmmaTensorOp: 'wmma_tensorop',
cutlass.OpClass.SparseTensorOp: 'sptensorop'
}
OpcodeClassTag = {
cutlass.OpClass.Simt: 'cutlass::arch::OpClassSimt',
cutlass.OpClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
cutlass.OpClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
cutlass.OpClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp'
}
###################################################################################################
#
class OperationKind(enum.Enum):
Gemm = enum_auto()
RankK = enum_auto()
Rank2K = enum_auto()
Trmm = enum_auto()
Symm = enum_auto()
Conv2d = enum_auto()
Conv3d = enum_auto()
#
OperationKindNames = {
OperationKind.Gemm: 'gemm',
OperationKind.RankK: 'rank_k',
OperationKind.Rank2K: 'rank_2k',
OperationKind.Trmm: 'trmm',
OperationKind.Symm: 'symm',
OperationKind.Conv2d: 'conv2d',
OperationKind.Conv3d: 'conv3d'
}
#
ArchitectureNames = {
50: 'maxwell',
60: 'pascal',
61: 'pascal',
70: 'volta',
75: 'turing',
80: 'ampere',
90: 'hopper'
}
#
SharedMemPerCC = {
70: 96 << 10, # 96KB of SMEM
72: 96 << 10, # 96KB of SMEM
75: 64 << 10, # 64KB of SMEM
80: 160 << 10, # 164KB of SMEM - 4KB reserved for the driver
86: 100 << 10, # 100KB of SMEM
87: 160 << 10, # 164KB of SMEM - 4KB reserved for the driver
89: 100 << 10, # 100KB of SMEM
90: 227 << 10, # 228KB of SMEM - 1KB reserved for the driver
}
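# Sketch (assumption, not from the original file): this table bounds how many
# pipeline stages fit in shared memory on a given compute capability.
def _max_stages_for_cc(cc, smem_bytes_per_stage):
    # e.g. on SM80 a 16 KiB stage allows (160 << 10) // (16 << 10) = 10 stages
    return SharedMemPerCC[cc] // smem_bytes_per_stage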
###################################################################################################
class GemmKind(enum.Enum):
Gemm = enum_auto()
Sparse = enum_auto()
Universal = enum_auto()
PlanarComplex = enum_auto()
PlanarComplexArray = enum_auto()
Grouped = enum_auto()
#
GemmKindNames = {
GemmKind.Gemm: "gemm",
GemmKind.Sparse: "spgemm",
GemmKind.Universal: "gemm",
GemmKind.PlanarComplex: "gemm_planar_complex",
GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
GemmKind.Grouped: "gemm_grouped"
}
#
class RankKKind(enum.Enum):
Universal = enum_auto()
#
RankKKindNames = {
RankKKind.Universal: "rank_k"
}
#
class TrmmKind(enum.Enum):
Universal = enum_auto()
#
TrmmKindNames = {
TrmmKind.Universal: "trmm"
}
#
class SymmKind(enum.Enum):
Universal = enum_auto()
#
SymmKindNames = {
SymmKind.Universal: "symm"
}
#
class SwizzlingFunctor(enum.Enum):
Identity1 = enum_auto()
Identity2 = enum_auto()
Identity4 = enum_auto()
Identity8 = enum_auto()
Horizontal = enum_auto()
BatchedIdentity1 = enum_auto()
StridedDgradIdentity1 = enum_auto()
StridedDgradIdentity4 = enum_auto()
StridedDgradHorizontal = enum_auto()
#
SwizzlingFunctorTag = {
cutlass.IdentitySwizzle1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
SwizzlingFunctor.BatchedIdentity1: "cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle",
SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
}
#
class SchedulerMode(enum.Enum):
Device = enum_auto()
Host = enum_auto()
#
SchedulerModeTag = {
SchedulerMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
SchedulerMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
}
#
ShortSchedulerModeNames = {
SchedulerMode.Device: 'Device',
SchedulerMode.Host: 'Host'
}
###################################################################################################
#
ConvKindTag = {
cutlass.conv.Operator.fprop: 'cutlass::conv::Operator::kFprop',
cutlass.conv.Operator.dgrad: 'cutlass::conv::Operator::kDgrad',
cutlass.conv.Operator.wgrad: 'cutlass::conv::Operator::kWgrad'
}
ConvKindNames = {
cutlass.conv.Operator.fprop: 'fprop',
cutlass.conv.Operator.dgrad: 'dgrad',
cutlass.conv.Operator.wgrad: 'wgrad',
}
#
IteratorAlgorithmTag = {
cutlass.conv.IteratorAlgorithm.analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
cutlass.conv.IteratorAlgorithm.optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
cutlass.conv.IteratorAlgorithm.fixed_channels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
cutlass.conv.IteratorAlgorithm.few_channels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
}
IteratorAlgorithmNames = {
cutlass.conv.IteratorAlgorithm.analytic: 'analytic',
cutlass.conv.IteratorAlgorithm.optimized: 'optimized',
cutlass.conv.IteratorAlgorithm.fixed_channels: 'fixed_channels',
cutlass.conv.IteratorAlgorithm.few_channels: 'few_channels'
}
#
class StrideSupport(enum.Enum):
Strided = enum_auto()
Unity = enum_auto()
#
StrideSupportTag = {
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}
StrideSupportNames = {
StrideSupport.Strided: '',
StrideSupport.Unity: 'unity_stride',
}
class ConvMode(enum.Enum):
CrossCorrelation = enum_auto()
Convolution = enum_auto()
#
ConvModeTag = {
ConvMode.CrossCorrelation: 'cutlass::conv::Mode::kCrossCorrelation',
ConvMode.Convolution: 'cutlass::conv::Mode::kConvolution'
}
###################################################################################################
#
class MathInstruction:
"""
Description of the lowest-level matrix-multiply-accumulate operation to be used in a kernel
"""
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class=cutlass.OpClass.Simt, math_operation=MathOperation.multiply_add):
"""
:param instruction_shape: size of the [M, N, K] dimensions of the instruction
:type instruction_shape: list or tuple
:param element_a: data type of operand A
:param element_b: data type of operand B
:param element_accumulator: data type used in accumulation
:param opcode_class: higher-level class of the instruction (e.g., SIMT or Tensor Core)
:type opcode_class: cutlass.OpClass
:param math_operation: the type of low-level operation to be performed (e.g., multiply accumulate)
:type math_operation: MathOperation
"""
self.instruction_shape = instruction_shape
self.element_a = element_a
self.element_b = element_b
self.element_accumulator = element_accumulator
self.opcode_class = opcode_class
self.math_operation = math_operation
#
class TileDescription:
"""
Description of a tile of computation to be performed in the kernel, encompassing threadblock, cluster, and warp shapes,
stage count, and math instruction specification
"""
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, cluster_shape=[1, 1, 1], persistent=False):
"""
:param threadblock_shape: shape of a threadblock tile
:type threadblock_shape: list or tuple
:param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
number of stages that can be supported for an operation on a given architecture will be computed at a later time
:type stages: int or None
:param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile
:type warp_count: list, tuple, or None
:param math_instruction: specification of the instruction type and shape to be performed and the types of its operands
:type math_instruction: MathInstruction
:param cluster_shape: number of threadblocks in the [X, Y, Z] dimensions of a threadblock cluster
:param persistent: whether the kernel uses persistent warp-specialized threadblocks (only available for SM90+)
:type persistent: bool
"""
self.threadblock_shape = threadblock_shape
self.cluster_shape = cluster_shape
self.persistent: bool = persistent
self.stages: int = stages
self.math_instruction = math_instruction
# Number of warps along x, y, z directions
self.warp_count = warp_count
@property
def num_threads(self):
"""
Returns the number of threads in the threadblock
:return: number of threads in the threadblock
:rtype: int or None (if warp count is None)
"""
if self.warp_count is not None:
threads = 32
for cnt in self.warp_count:
threads *= cnt
return threads
return None
def procedural_name(self):
"""
Returns a name identifying the tile description
:return: name identifying the tile description
:rtype: int
"""
emit_stages = 0 if self.stages is None else self.stages
name = "%dx%dx%d_%dx%d_%dx%d" % (
self.cluster_shape[0], self.cluster_shape[1], self.cluster_shape[2],
self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], emit_stages)
if self.persistent:
name += '_persistent'
return name
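# Example (sketch): a 128x128x64 threadblock tile, a 2x1x1 cluster, and 4
# stages produce the name "2x1x1_128x128_64x4".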
#
class TensorDescription:
def __init__(self, element, layout, alignment=1, complex_transform=cutlass.complex_transform.none):
self.element = element
self.layout = layout
self.alignment = min(128 // DataTypeSize[self.element], alignment)
self.complex_transform = complex_transform
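# Example (sketch): alignment is clamped so one access never exceeds 128 bits;
# TensorDescription(cutlass.float16, cutlass.RowMajor, alignment=16) stores
# alignment = min(128 // 16, 16) = 8 elements.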
#
class SymmetricTensorDescription:
def __init__(self, element, layout, fill_mode, alignment=1, complex_transform=cutlass.complex_transform.none, side_mode=SideMode.Left):
self.element = element
self.layout = layout
self.fill_mode = fill_mode
self.alignment = alignment
self.complex_transform = complex_transform
self.side_mode = side_mode
#
class TriangularTensorDescription:
def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment=1, complex_transform=cutlass.complex_transform.none):
self.element = element
self.layout = layout
self.side_mode = side_mode
self.fill_mode = fill_mode
self.diag_type = diag_type
self.alignment = alignment
self.complex_transform = complex_transform
###################################################################################################
#
def CalculateSmemUsagePerStage(operation):
"""
Returns the amount of shared memory in bytes consumed in a single stage of a kernel.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.Operation
:return: number of bytes of shared memory consumed by a single stage
:rtype: int
"""
m, n, k = operation.tile_description.threadblock_shape
if operation.operation_kind == OperationKind.Gemm:
stage_barrier_bytes = 32
return (DataTypeSize[operation.A.element] * m * k // 8) + \
(DataTypeSize[operation.B.element] * k * n // 8) + stage_barrier_bytes
else:
raise Exception('Unsupported operation kind {}.'.format(operation.operation_kind))
#
def CalculateSmemUsage(operation):
"""
Returns the amount of shared memory in bytes consumed by a kernel.
:param op: operation for which the maximum stages should be computed. If stages are
set via the `op.tile_description.stages` parameter, this setting is ignored
in the present calculation
:type op: pycutlass.Operation
:return: int
"""
return operation.tile_description.stages * CalculateSmemUsagePerStage(operation)
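# Worked example (sketch): a 128x128x32 threadblock tile with f16 A and B
# consumes (16*128*32 // 8) + (16*32*128 // 8) + 32 = 16416 bytes per stage,
# so with 3 stages CalculateSmemUsage returns 3 * 16416 = 49248 bytes.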
class ApiVersion(enum.Enum):
"""
Differentiate between CUTLASS 2.x and 3.x API versions
"""
v2x = enum_auto()
v3x = enum_auto()
def api_version(arch, opclass, datatype):
"""
Returns whether the architecture, opcode class, and datatype in question require using CUTLASS 2.x
or 3.x for code emission.
:param arch: compute capability of device on which to run
:type arch: int
:param opclass: class of the operation being performed
:type opclass: cutlass.OpClass
:param datatype: data type to be used in operation (assumes that ElementA and ElementB are the same)
:return: API version to be used in code emission
:rtype: ApiVersion
"""
if arch >= 90 and opclass == cutlass.OpClass.TensorOp and (datatype != cutlass.float64):
return ApiVersion.v3x
else:
return ApiVersion.v2x
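# Example (sketch) of the dispatch rule above:
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float16) -> ApiVersion.v3x
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float64) -> ApiVersion.v2x
#   api_version(80, cutlass.OpClass.TensorOp, cutlass.float16) -> ApiVersion.v2x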
###################################################################################################

View File

@ -1,74 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import rmm
import numpy as np
class PoolMemoryManager:
def __init__(self, init_pool_size: int, max_pool_size: int) -> None:
self.pool = rmm.mr.PoolMemoryResource(
rmm.mr.CudaMemoryResource(),
initial_pool_size=init_pool_size,
maximum_pool_size=max_pool_size
)
self.mr = rmm.mr.TrackingResourceAdaptor(self.pool)
rmm.mr.set_current_device_resource(self.mr)
def get_allocated_size(self):
return self.mr.get_allocated_bytes()
def pool_size(self):
return self.pool.pool_size()
def todevice(host_data, dtype=np.float32):
"""
Copy the host data to device memory
"""
if isinstance(host_data, list):
return rmm.DeviceBuffer.to_device(np.array(host_data, dtype=dtype).tobytes())
elif isinstance(host_data, np.ndarray):
return rmm.DeviceBuffer.to_device(host_data.tobytes())
else:
raise TypeError('unsupported host data type: {}'.format(type(host_data)))
def device_mem_alloc(size):
return rmm.DeviceBuffer(size=size)
def align_size(size, alignment=256):
return ((size + alignment - 1) // alignment) * alignment
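# Example (sketch): align_size rounds a byte count up to the next multiple of
# `alignment`, e.g. align_size(1000) -> 1024 and align_size(512) -> 512.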
def get_allocated_size():
device_resource = rmm.mr.get_current_device_resource()
return device_resource.get_allocated_bytes()

View File

@ -1,153 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
import ctypes
from cuda import cuda
from pycutlass.utils.device import device_cc
from cuda import __version__ as __cuda_version__
_version_splits = [int(x) for x in __cuda_version__.split('.')]
supports_cluster_launch = device_cc() >= 90 and (_version_splits[0] > 11 or (_version_splits[0] == 11 and _version_splits[1] >= 8))
################################################################################
#
# Launch configuration
#
################################################################################
class LaunchConfiguration:
def __init__(self, grid=[1, 1, 1], block=[1, 1, 1], smem=0):
self.grid = grid
self.block = block
self.shared_memory_capacity = smem
################################################################################
#
# Base class for an executable operation
#
################################################################################
class ExecutableOperation:
'''
Base class wrapping a compiled CUTLASS kernel and its launch logic
'''
def __init__(self, operation):
self.operation = operation
self.module = None
self.kernel = None
#
def name(self):
return self.operation.procedural_name()
#
def emit(self):
return ''
#
def can_implement(self, configuration, arguments):
raise NotImplementedError()
#
def get_host_workspace_size(self, arguments):
raise NotImplementedError()
#
def get_device_workspace_size(self, arguments):
raise NotImplementedError()
#
def plan(self, arguments):
raise NotImplementedError()
#
def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=cuda.CUstream(0)):
raise NotImplementedError()
#
def run_with_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
if hasattr(self.operation, 'tile_description') and hasattr(self.operation.tile_description, 'cluster_shape'):
attr = cuda.CUlaunchAttribute()
attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.operation.tile_description.cluster_shape
attr.id = cuda.CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
attrs = [attr]
# Allow for non-portable cluster sizes
err, = cuda.cuFuncSetAttribute(
self.kernel, cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)
if err != cuda.CUresult.CUDA_SUCCESS:
return err
else:
attrs = []
config = cuda.CUlaunchConfig()
config.gridDimX, config.gridDimY, config.gridDimZ = launch_config.grid
config.blockDimX, config.blockDimY, config.blockDimZ = launch_config.block
config.sharedMemBytes = launch_config.shared_memory_capacity
config.hStream = stream
config.attrs = attrs
config.numAttrs = len(attrs)
err, = cuda.cuLaunchKernelEx(config, f=self.kernel, kernelParams=kernel_params, extra=0)
return err
#
def run_without_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
err, = cuda.cuLaunchKernel(
self.kernel,
launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
launch_config.block[0], launch_config.block[1], launch_config.block[2],
launch_config.shared_memory_capacity,
stream,
kernel_params,
0)
return err
#
def run(self, host_workspace, device_workspace, launch_config, stream=cuda.CUstream(0)):
cArg = (ctypes.c_char * len(host_workspace)
).from_buffer(host_workspace)
packed = (ctypes.c_void_p * 1)()
packed[0] = ctypes.addressof(cArg)
if supports_cluster_launch:
return self.run_with_clusters(launch_config, packed, stream)
else:
return self.run_without_clusters(launch_config, packed, stream)
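# Note (sketch): host_workspace is the raw Params bytearray emitted by the
# host-side get_params() helper; it is wrapped in a single void* kernel
# parameter so the device kernel receives its Params struct by value.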

View File

@ -1,614 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typing import Generic, TypeVar
from treelib import Tree
import numpy as np
from pycutlass import *
import pycutlass
import ast
import textwrap
import inspect
################################################################################
# Type annotation for input arguments
################################################################################
Ttype = TypeVar("Ttype")
Dtype = TypeVar("Dtype")
class NDArray(np.ndarray, Generic[Ttype, Dtype]):
pass
################################################################################
# Operations
################################################################################
operators = {
ast.Add: "Add",
ast.Div: "Div",
ast.Eq: "Equal",
ast.Mult: "Mult"
}
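# Illustration (not part of the original module): parsing a typical
# epilogue expression shows how BinOp operator types index this table.
# "alpha * accum + beta * c" parses to an Add at the root with a Mult
# on each side.
_example = ast.parse("alpha * accum + beta * c", mode="eval").body
assert operators[type(_example.op)] == "Add"        # root BinOp
assert operators[type(_example.left.op)] == "Mult"  # left subtree
del _example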
################################################################################
# AST Node abstractions
################################################################################
class UnaryNode:
cnt = 0
# Concept: this is created by the BinOp Node in python ast
def __init__(self,
element_accumulator, element_compute, elements_per_access,
node, args) -> None:
if isinstance(node, BinOpNode):
self.op = node.op
elif isinstance(node, ast.Call):
if isinstance(node.func, ast.Name):
self.op = node.func.id
elif isinstance(node.func, ast.Attribute):
self.op = node.func.value.id
else:
raise TypeError
else:
raise TypeError
self.tag = "Unary" + self.op + str(UnaryNode.cnt)
self.id = self.op + str(UnaryNode.cnt)
self.args = args
UnaryNode.cnt += 1
self.type = "tensor"
self.epilogue_op = getattr(pycutlass, self.op)(element_compute)
# data types
self.element_accumulator = element_accumulator
self.element_compute = element_compute
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = UnaryOp(
self.element_accumulator, self.element_compute,
self.elements_per_access, *visitors, self.epilogue_op)
def get_argument(self, visitor_args, kwargs):
epilogue_ops = []
for arg in self.args:
            try:
                epilogue_ops.append(kwargs[arg])
            except KeyError:
                epilogue_ops.append(arg)  # direct arguments such as constants
self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(*epilogue_ops), *visitor_args)
class BinOpNode:
cnt = 0
# Concept: this is created by the BinOp Node in python ast
def __init__(self,
element_accumulator, element_compute, elements_per_access,
node) -> None:
self.op = operators[type(node.op)]
self.tag = "Binary" + self.op + str(BinOpNode.cnt)
self.id = self.op + str(BinOpNode.cnt)
self.args = None
BinOpNode.cnt += 1
self.type = "tensor"
self.epilogue_op = getattr(pycutlass, "Vector"+self.op)(element_compute)
# data types
self.element_accumulator = element_accumulator
self.element_compute = element_compute
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = BinaryOp(
self.element_accumulator, self.element_compute,
self.elements_per_access, *visitors, self.epilogue_op)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(self.args), *visitor_args)
class NameNode:
# Concept: this is created by the Name Node in python ast
def __init__(self, node) -> None:
        try:
            self.id = node.id
        except AttributeError:
            self.id = node.targets[0].id
self.tag = self.id
class ScalarInputNode(NameNode):
# Concept: scalar
def __init__(self, node) -> None:
super().__init__(node)
self.tag = "Scalar:" + self.tag
self.type = "scalar"
class AccumulatorNode(NameNode):
# Concept: VisitorOpAccumulator
def __init__(self,
element_accumulator, elements_per_access, node) -> None:
super().__init__(node)
self.tag = "Accum:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
def get_epilogue_node(self, visitors):
self.epilogue_node = AccumulatorOp(
self.element_accumulator, self.elements_per_access)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type()
class TensorInputNode(NameNode):
# Concept: VisitorOpTensorInput
def __init__(self, element_accumulator, node) -> None:
super().__init__(node)
self.tag = "TensorInput:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
def get_epilogue_node(self, *args):
self.epilogue_node = TensorInputOp(self.element_accumulator)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(
kwargs[self.id + "_ptr"], kwargs["problem_size"][1],
kwargs["problem_size"][0] * kwargs["problem_size"][1])
class RowBroadcastNode(NameNode):
# Concept: VisitorOpRowBroadcast
def __init__(self, element_accumulator, element_fragment, node) -> None:
super().__init__(node)
#
self.tag = "RowBroadcast:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_fragment = element_fragment
def get_epilogue_node(self, *args):
self.epilogue_node = RowBroadcastOp(
self.element_accumulator, self.element_fragment)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][1])
class ColumnBroadcastNode(NameNode):
# Concept: VisitorOpColumnBroadcast
def __init__(self, element_accumulator, element_fragment, node) -> None:
super().__init__(node)
self.tag = "ColumnBroadcast:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_fragment = element_fragment
def get_epilogue_node(self, *args):
self.epilogue_node = ColumnBroadcastOp(
self.element_accumulator, self.element_fragment)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][0])
class TensorOutputNode(NameNode):
# Concept: VisitorOpTensorOutput
def __init__(self, element_accumulator, node) -> None:
super().__init__(node)
self.tag = "TensorOutput:" + self.tag
self.type = "tensor"
self.element_accumulator = element_accumulator
def get_epilogue_node(self, visitors):
self.epilogue_node = TensorOutputOp(self.element_accumulator, *visitors)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][1], *visitor_args, kwargs["problem_size"][0] * kwargs["problem_size"][1])
class RowReductionNode:
# Concept: RowReductionOp
def __init__(self, element_accumulator, element_reduction,
element_reduction_accumulator, id, factor) -> None:
#
self.id = id
self.tag = "RowReduction:" + self.id
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_reduction = element_reduction
self.element_reduction_accumulator = element_reduction_accumulator
self.factor = factor
def get_epilogue_node(self, visitors):
self.epilogue_node = RowReductionOp(
self.element_accumulator, self.element_reduction,
self.element_reduction_accumulator, *visitors)
def get_batch_stride(self, problem_size):
return problem_size[0] * ((problem_size[1] + self.factor - 1) // self.factor)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))
class ColumnReductionNode:
# Concept: ColumnReductionOp
def __init__(self, element_accumulator, element_reduction,
element_reduction_accumulator, id, factor) -> None:
#
self.id = id
self.tag = "ColumnReduction:" + self.id
self.type = "tensor"
self.element_accumulator = element_accumulator
self.element_reduction = element_reduction
self.element_reduction_accumulator = element_reduction_accumulator
self.factor = factor
def get_epilogue_node(self, visitors):
self.epilogue_node = ColumnReductionOp(
self.element_accumulator, self.element_reduction,
self.element_reduction_accumulator, *visitors)
def get_batch_stride(self, problem_size):
return problem_size[1] * ((problem_size[0] + self.factor - 1) // self.factor)
def get_argument(self, visitor_args, kwargs):
self.argument = self.epilogue_node.argument_type(kwargs[self.id + '_ptr'], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))
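# Worked example (illustrative, not part of the original module): the
# batch stride between split-K partitions uses ceiling division on the
# reduced dimension. For a RowReductionNode over a 128x100 problem with
# factor 64 (the threadblock column count):
#   128 * ((100 + 64 - 1) // 64) == 128 * 2 == 256 elements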
################################################################################
# Epilogue parser function
################################################################################
class EpilogueAST(ast.NodeVisitor):
def __init__(self, epilogue,
tile_description,
element_accumulator, elements_per_access,
element_compute, element_output) -> None:
#
self.tile_description = tile_description
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
self.element_compute = element_compute
self.element_output = element_output
self.epilogue = epilogue
self.source = textwrap.dedent(inspect.getsource(epilogue.__call__))
self.ast_tree = ast.parse(self.source)
self.epilogue_tree = Tree()
# print(ast.dump(self.ast_tree, indent=4)) # For Debug purpose
# input arguments
self.input_args = {}
# return nodes
self.returns = []
# reduction source nodes
self.reduction_source = {}
# stack used to keep the parent node id
self.stack = []
# visit the AST
self.visit(self.ast_tree)
# visit the name node
def visit_Name(self, node):
# append the return ids into self.returns
if self.stack[-1] == "return":
self.returns.append(node.id)
else:
# accum is produced from accumulator node
if node.id == "accum":
name_node = AccumulatorNode(
self.element_accumulator, self.elements_per_access, node)
else:
# for input nodes
                if node.id in self.input_args.keys():
                    arg_type = self.input_args[node.id][0]
                    if arg_type == "tensor":
                        name_node = TensorInputNode(self.element_accumulator, node)
                    elif arg_type == "row":
                        name_node = RowBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "column":
                        name_node = ColumnBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "scalar":
                        name_node = ScalarInputNode(node)
                    else:
                        raise ValueError(arg_type)
# for output nodes
else:
name_node = TensorOutputNode(self.element_accumulator, node)
self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node, parent=self.stack[-1])
def visit_Assign(self, node):
pre_assign_node = self.epilogue_tree.get_node(node.targets[0].id)
if pre_assign_node is None:
# The assign is to a root node
# skip the reduction nodes
if isinstance(node.value, ast.Call):
if isinstance(node.value.func, ast.Name):
func_type = node.value.func.id
elif isinstance(node.value.func, ast.Attribute):
func_type = node.value.func.value.id
else:
raise TypeError
if func_type == 'reduction_op':
self.reduction_source[node.value.args[0].id] = [node.value.args[1].value, node.value.args[2].value, node.targets[0].id]
return
name_node = TensorOutputNode(self.element_accumulator, node)
self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node)
self.stack.append(name_node.id)
else:
if node.targets[0].id in self.returns or node.targets[0].id in self.reduction_source.keys():
self.stack.append(node.targets[0].id)
else:
self.stack.append(pre_assign_node.predecessor(self.epilogue_tree.identifier))
self.epilogue_tree.remove_node(node.targets[0].id)
# get child tag
self.visit(node.value)
self.stack.pop()
def visit_Call(self, node):
if isinstance(node.func, ast.Name):
func_type = node.func.id
elif isinstance(node.func, ast.Attribute):
func_type = node.func.value.id
else:
raise TypeError
if func_type == "reduction_op":
self.visit(node.args[0])
else:
arg_list = []
for idx, arg in enumerate(node.args):
if idx == 0: continue
if isinstance(arg, ast.Constant):
arg_list.append(arg.value)
elif isinstance(arg, ast.Name):
arg_list.append(arg.id)
else:
raise TypeError
unary_node = UnaryNode(self.element_accumulator, self.element_compute, self.elements_per_access, node, arg_list)
self.epilogue_tree.create_node(unary_node.tag, unary_node.id, parent=self.stack[-1], data=unary_node)
self.stack.append(unary_node.id)
self.visit(node.args[0])
self.stack.pop()
def visit_BinOp(self, node):
binop = BinOpNode(self.element_accumulator, self.element_compute,
self.elements_per_access, node)
self.epilogue_tree.create_node(binop.tag, binop.id, data=binop, parent=self.stack[-1])
self.stack.append(binop.id)
self.visit(node.left)
self.visit(node.right)
self.stack.pop()
def visit_Return(self, node):
self.stack.append("return")
self.visit(node.value)
self.stack.pop()
    # A function definition
def visit_FunctionDef(self, node: ast.FunctionDef):
# visit args
for arg in node.args.args:
if arg.arg == "self": continue
if isinstance(arg.annotation, ast.Constant):
self.input_args[arg.arg] = [arg.annotation.value, ]
# visit the assign in the reverse order
for idx in range(len(node.body)):
self.visit(node.body[-1-idx])
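    # Example (hypothetical) epilogue in the form this visitor parses:
    # input kinds are declared via string annotations ("tensor", "row",
    # "column", "scalar"), "accum" names the accumulator, and an
    # assignment to a fresh name that is returned becomes a tensor
    # output:
    #
    #     def __call__(self, accum, c: 'tensor', alpha: 'scalar', beta: 'scalar'):
    #         d = alpha * accum + beta * c
    #         return d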
#
# Tree optimization pass
#
# pass 1: lower Binary to Unary
def pass_binary_2_unary(self, tree, nid):
node = tree.get_node(nid)
if isinstance(node.data, BinOpNode):
lhs_node = tree.get_node(node.successors(tree.identifier)[0])
left_type = lhs_node.data.type
rhs_node = tree.get_node(node.successors(tree.identifier)[1])
right_type = rhs_node.data.type
if left_type == "scalar" and right_type == "tensor":
node.data = UnaryNode(
self.element_accumulator, self.element_compute,
self.elements_per_access,
node.data, [lhs_node.data.id,])
node.tag = node.data.tag
tree.remove_node(lhs_node.data.id)
self.pass_binary_2_unary(tree, rhs_node.data.id)
elif left_type == "tensor" and right_type == "scalar":
node.data = UnaryNode(
self.element_accumulator, self.element_compute,
self.elements_per_access,
node.data, [rhs_node.id,])
node.tag = node.data.tag
tree.remove_node(rhs_node.data.id)
self.pass_binary_2_unary(tree, lhs_node.data.id)
else:
self.pass_binary_2_unary(tree, lhs_node.data.id)
self.pass_binary_2_unary(tree, rhs_node.data.id)
else:
for child in node.successors(tree.identifier):
self.pass_binary_2_unary(tree, child)
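    # For example (a sketch): in "alpha * accum", the Mult BinOpNode has
    # a scalar child ("alpha") and a tensor child ("accum"); the pass
    # rewrites the BinOpNode into a UnaryNode that applies Mult with
    # "alpha" folded into its argument list, removes the scalar leaf,
    # and recurses into the remaining tensor subtree.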
# pass 2: inject reduction nodes
def pass_inject_reduction(self, tree, nid):
node = tree.get_node(nid)
if isinstance(node.data, TensorOutputNode):
if node.data.id in self.reduction_source.keys():
direction = self.reduction_source[node.data.id][0]
target = self.reduction_source[node.data.id][-1]
if direction == 'row':
reduction_node = RowReductionNode(
self.element_accumulator, self.element_output,
self.element_accumulator, target, self.tile_description.threadblock_shape[1])
elif direction == "column":
reduction_node = ColumnReductionNode(
self.element_accumulator, self.element_output,
self.element_accumulator, target, self.tile_description.threadblock_shape[0])
else:
raise ValueError(direction)
child_nid = node.successors(tree.identifier)[0]
# if this output node is injected only for reduction
if node.data.id not in self.returns:
# get reduction config from disc
node.data = reduction_node
node.tag = reduction_node.tag
self.pass_inject_reduction(tree, child_nid)
# if this output node is also a tensor output, inject reduction as its children
else:
# get child node
tree.create_node(reduction_node.tag, reduction_node.id, data=reduction_node, parent=node.data.id)
tree.move_node(child_nid, reduction_node.id)
child = tree.get_node(child_nid)
for grand_child in child.successors(tree.identifier):
self.pass_inject_reduction(tree, grand_child)
else:
for child in node.successors(tree.identifier):
self.pass_inject_reduction(tree, child)
else:
for child in node.successors(tree.identifier):
self.pass_inject_reduction(tree, child)
def pass_inject_epilogue_op(self, tree, nid):
node = tree.get_node(nid)
visitors = []
for child in node.successors(tree.identifier):
visitors.append(self.pass_inject_epilogue_op(tree, child))
node.data.get_epilogue_node(visitors)
return node.data.epilogue_node
def get_arguments(self, tree, nid, kwargs):
node = tree.get_node(nid)
visitor_args = []
for child in node.successors(tree.identifier):
visitor_args.append(self.get_arguments(tree, child, kwargs))
node.data.get_argument(visitor_args, kwargs)
return node.data.argument
class EpilogueVisitTree:
KernelTemplate = """
${visitor}
using ${operation_name}_EpilogueVisitor = cutlass::epilogue::threadblock::EpilogueVisitorGeneric<${visitor_name}>;
"""
def __init__(self, elementwise_functor, tile_description,
element_accumulator, elements_per_access,
element_compute, element_output) -> None:
#
# data types
self.tile_description = tile_description
self.element_accumulator = element_accumulator
self.elements_per_access = elements_per_access
self.element_compute = element_compute
self.element_output = element_output
        self.elementwise_functor = elementwise_functor
def initialize(self):
function = EpilogueAST(self, self.tile_description,
self.element_accumulator, self.elements_per_access,
self.element_compute, self.element_output)
#
tree = function.epilogue_tree
self.tree = tree
function.pass_binary_2_unary(self.tree, self.tree.root)
function.pass_inject_reduction(self.tree, self.tree.root)
        function.pass_inject_epilogue_op(self.tree, self.tree.root)
visitor = self.tree.get_node(self.tree.root).data.epilogue_node
self.visitor = visitor
class _Argument(ctypes.Structure):
_fields_ = [
("visitor_arg", visitor.argument_type)
]
def __init__(self, **kwargs) -> None:
# process input args
_kwargs = {}
for input_key in function.input_args.keys():
if input_key == "accum":
continue
if function.input_args[input_key][0] == "scalar":
continue
# tensor input
else:
setattr(self, "buffer_tensor_" + input_key, NumpyFrontend.argument(kwargs[input_key], False))
setattr(self, input_key + "_ptr", int(getattr(self, "buffer_tensor_" + input_key).ptr))
_kwargs[input_key+"_ptr"] = getattr(self, input_key + "_ptr")
# process the return args
for ret in function.returns:
setattr(self, "buffer_tensor_" + ret, NumpyFrontend.argument(kwargs[ret], True))
setattr(self, ret + "_ptr", int(getattr(self, "buffer_tensor_" + ret).ptr))
_kwargs[ret+"_ptr"] = getattr(self, ret + "_ptr")
setattr(self, "host_tensor_" + ret, kwargs[ret])
_kwargs.update(kwargs)
function.get_arguments(tree, tree.root, _kwargs)
self.visitor_arg = tree.get_node(tree.root).data.argument
def sync(self, stream_sync=True):
if stream_sync:
                err, = cudart.cudaDeviceSynchronize()
                if err != cudart.cudaError_t.cudaSuccess:
                    raise RuntimeError("CUDA Error %s" % str(err))
for ret in function.returns:
err, = cuda.cuMemcpyDtoH(
getattr(self, "host_tensor_" + ret), cuda.CUdeviceptr(getattr(self, ret + "_ptr")),
getattr(self, "host_tensor_" + ret).size * getattr(self, "host_tensor_" + ret).itemsize
)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
self.epilogue_type = _Argument
def emit(self, operation):
values = {
'visitor': self.visitor.emit(operation),
'operation_name': operation.procedural_name(),
'visitor_name': self.visitor.instance_name
}
return SubstituteTemplate(self.KernelTemplate, values)
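A minimal usage sketch (hypothetical subclass and type choices; `elementwise_functor` and `tile_description` are assumed to exist, and `EpilogueAST` parses the source of `__call__`, so the expression must use the annotated input kinds described above):

    class LinearScaling(EpilogueVisitTree):
        def __call__(self, accum, c: 'tensor', alpha: 'scalar', beta: 'scalar'):
            d = alpha * accum + beta * c
            return d

    epilogue = LinearScaling(
        elementwise_functor, tile_description,
        element_accumulator=cutlass.float32, elements_per_access=8,
        element_compute=cutlass.float32, element_output=cutlass.float32)
    epilogue.initialize()  # builds the visitor tree and defines epilogue.epilogue_type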

View File

@ -1,398 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from pycutlass import *
from pycutlass.c_types import get_reduction_params
import cutlass
from cuda import cuda
try:
import torch
torch_available = True
except ImportError:
torch_available = False
import numpy as np
from typing import Union
from cuda import cudart
class ReductionOperation:
pass
class ReductionArguments:
"""
Arguments of reduction
"""
def __init__(self, operation: ReductionOperation,
problem_size: 'list[int]', partitions: int,
workspace: cuda.CUdeviceptr,
destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:
# tensor_C can be interpreted as the bias with bias=True in keyword args
if "bias" in kwargs.keys():
self.bias = kwargs["bias"]
else:
# by default, tensor_C is not bias
self.bias = False
self.operation = operation
#: pointer to the workspace
self.ptr_workspace = workspace
#: number of split-k partitions
self.partitions = partitions
if isinstance(destination, np.ndarray):
self.host_D = destination
self.destination_buffer = NumpyFrontend.argument(destination, True)
self.source_buffer = NumpyFrontend.argument(source, False)
self.ptr_destination = cuda.CUdeviceptr(
self.destination_buffer.ptr)
self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
elif torch_available and isinstance(destination, torch.Tensor):
self.ptr_destination = TorchFrontend.argument(destination)
self.ptr_source = TorchFrontend.argument(source)
elif isinstance(destination, cuda.CUdeviceptr):
self.ptr_destination = destination
self.ptr_source = source
else:
raise TypeError("unknown Type")
self.problem_size = MatrixCoord_(
problem_size[0], problem_size[1]
)
self.partition_stride = problem_size[0] * \
problem_size[1] * DataTypeSize[operation.C.element] // 8
if "output_op" in kwargs.keys():
self.output_op = kwargs['output_op']
else:
self.output_op = self.operation.epilogue_type(1.0, 0.0)
# get arguments
self.get_arguments()
@staticmethod
def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
if layout == cutlass.RowMajor:
return TensorRef2D_(int(device_ptr), extent[1])
else:
raise ValueError("unknown layout type")
def get_arguments(self):
ref_workspace = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
if self.bias:
ref_source = ReductionArguments.get_tensor_ref(
extent=[0, 0],
device_ptr=self.ptr_source, layout=cutlass.RowMajor)
else:
ref_source = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_source, layout=cutlass.RowMajor)
ref_destination = ReductionArguments.get_tensor_ref(
extent=[self.problem_size.row, self.problem_size.column],
device_ptr=self.ptr_destination, layout=cutlass.RowMajor)
self.c_arguments = self.operation.argument_type(
self.problem_size, self.partitions,
self.partition_stride, ref_workspace,
ref_destination, ref_source,
self.output_op
)
params_ = self.operation.rt_module.get_args(
ctypes.byref(self.c_arguments))
self.host_workspace = bytearray(params_.contents)
def sync(self):
        err, = cudart.cudaDeviceSynchronize()
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA Error %s" % str(err))
if hasattr(self, "host_D"):
err, = cuda.cuMemcpyDtoH(
self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def free(self):
if hasattr(self, "destination_buffer"):
del self.destination_buffer
if hasattr(self, "source_buffer"):
del self.source_buffer
class ReductionRT(ExecutableOperation):
"""
ReductionRT manages the CUTLASS runtime components for reduction
"""
KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {
// Dynamic shared memory base pointer
extern __shared__ int SharedStorageBase[];
// Declare pointer to dynamic shared memory.
${operation_name}${operation_suffix}::SharedStorage *shared_storage =
reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);
${operation_name}${operation_suffix} op;
op(params, *shared_storage);
}
'''
HostTemplate = r'''
extern "C" {
// Get the size of params in bytes
int ${operation_name}_get_param_size(){
return sizeof(${operation_name}${operation_suffix}::Params);
}
// Get the size of dynamic shared memory in bytes
int ${operation_name}_shared_memory_size() {
return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
}
// Get the params as byte array
char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
char *bytes = ((char*)(params));
char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
output[i] = bytes[i];
return output;
}
}
'''
def __init__(self, operation: ReductionOperation):
super().__init__(operation)
self.operation: ReductionOperation = operation
self.emitter = EmitReductionInstance('_type')
self.elements_per_access = self.operation.count
self.argument_type, self.epilogue_type = get_reduction_params(operation.epilogue_functor)
self.argtype = [ctypes.POINTER(self.argument_type)]
def emit(self):
return self.emitter.emit(self.operation)
def plan(self, arguments: ReductionArguments):
        block_shape = [self.operation.shape.column() // self.elements_per_access,
                       self.operation.shape.row(), 1]
grid_shape = [
(arguments.problem_size.row + self.operation.shape.row() -
1) // self.operation.shape.row(),
(arguments.problem_size.column + self.operation.shape.column() -
1) // self.operation.shape.column(),
1
]
return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)
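    # Worked example (illustrative): Conv2dLauncher builds this operation
    # with shape MatrixCoord(4, 32 * alignment) and count = alignment.
    # With alignment 4 the shape is 4x128 and elements_per_access is 4,
    # so block = [128 // 4, 4, 1] = [32, 4, 1] (128 threads), and the
    # grid tiles the problem at 4 rows x 128 columns per CTA.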
def initialize(self):
err, = cuda.cuFuncSetAttribute(
self.kernel,
attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
value=self.shared_memory_capacity)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('Cuda Error: {}'.format(err))
class ReductionOperation:
"""
CUTLASS Reduction Operation
    shape: shape of the CTA tile
    epilogue_functor: output operator applied after the reduction
    """
def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
element_accumulator, element_workspace=None,
element_compute=None, epilogue_functor=None,
count: int = 1, partitions_per_stage: int = 4) -> None:
""" Constructor
"""
self.shape = shape
#: epilogue functor (default: LinearCombination)
self.epilogue_functor = epilogue_functor
#: datatype of accumulator
self.element_accumulator = element_accumulator
if element_workspace is None:
#: datatype of workspace
self.element_workspace = element_accumulator
else:
#: datatype of workspace
self.element_workspace = element_workspace
        if element_compute is None:
            #: datatype of compute
            self.element_compute = element_accumulator
        else:
            #: datatype of compute
            self.element_compute = element_compute
#: datatype of output
self.element_output = C.element
#: operand C
self.C: TensorDescription = C
#: reduce op processing size
self.count: int = count
#: number of partitions to reduce per stage
self.partitions_per_stage: int = partitions_per_stage
self.rt_module: ReductionRT = ReductionRT(self)
self.argument_type = self.rt_module.argument_type
self.epilogue_type = self.rt_module.epilogue_type
#
def extended_name(self):
extend_name = "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}"
return SubstituteTemplate(extend_name,
{
'element_workspace': DataTypeNames[self.element_workspace],
'element_accumulator': DataTypeNames[self.element_accumulator],
'element_compute': DataTypeNames[self.element_compute],
'element_output': DataTypeNames[self.element_output]
})
#
def configuration_name(self):
        ''' The configuration name encodes the extended name and tile size '''
configuration_name = "cutlass_reduce_split_k_${extended_name}_${threadblock}"
threadblock = "%dx%d" % (
self.shape.row(),
self.shape.column()
)
return SubstituteTemplate(
configuration_name,
{
'extended_name': self.extended_name(),
'threadblock': threadblock
}
)
#
def procedural_name(self):
        ''' The procedural name is identical to the configuration name '''
return self.configuration_name()
def run(self, arguments: ReductionArguments) -> cuda.CUresult:
"""
Configure and launch the cuda kernel with input arguments
"""
# get launch configuration
launch_config = self.rt_module.plan(arguments)
# get the host and device workspace
host_workspace = arguments.host_workspace
device_workspace = None
# launch the kernel
err = self.rt_module.run(
host_workspace, device_workspace, launch_config)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError('CUDA Error %s' % str(err))
return err
class EmitReductionInstance:
def __init__(self, operation_suffix='') -> None:
self.operation_suffix = operation_suffix
self.includes = [
"cutlass/cutlass.h",
"cutlass/numeric_types.h",
"cutlass/arch/arch.h",
"cutlass/arch/mma.h",
"cutlass/layout/matrix.h",
"cutlass/gemm/device/gemm.h",
"cutlass/gemm/device/gemm_universal_adapter.h",
"cutlass/gemm/kernel/default_gemm_universal.h",
"cutlass/reduction/kernel/reduce_split_k.h",
"cutlass/reduction/thread/reduction_operators.h"
]
self.template = """
// Reduction kernel instance
using ${operation_name}_base =
typename cutlass::reduction::kernel::ReduceSplitK<
cutlass::MatrixShape<${shape_row}, ${shape_column}>,
${epilogue_functor},
cutlass::reduction::thread::ReduceAdd<
${element_accumulator},
${element_output},
${count}>,
${partition_per_stage}>;
struct ${operation_name}${operation_suffix}:
public ${operation_name}_base { };
"""
def emit(self, operation: ReductionOperation):
epilogue_vector_length = int(min(
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
values = {
'operation_name': operation.configuration_name(),
'operation_suffix': self.operation_suffix,
'shape_row': str(operation.shape.row()),
'shape_column': str(operation.shape.column()),
'epilogue_functor': operation.epilogue_functor.emit(),
'element_output': DataTypeTag[operation.element_output],
'epilogue_vector_length': str(epilogue_vector_length),
'element_accumulator': DataTypeTag[operation.element_accumulator],
'element_compute': DataTypeTag[operation.element_compute],
'element_workspace': DataTypeTag[operation.element_workspace],
'count': str(operation.count),
'partition_per_stage': str(operation.partitions_per_stage)
}
return SubstituteTemplate(self.template, values)

View File

@ -1,70 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typeguard import typechecked
import numpy as np
try:
import torch
torch_available = True
except ImportError:
torch_available = False
from cuda import cuda
try:
import cupy as cp
cupy_available = True
except ImportError:
cupy_available = False
import cutlass
# @typechecked
class TensorRef:
"""
Python Wrapper for cutlass.TensorRef
"""
def __init__(self, tensor, dtype, layout) -> None:
if isinstance(tensor, np.ndarray):
ptr = cuda.CUdeviceptr(tensor.__array_interface__['data'][0])
elif torch_available and isinstance(tensor, torch.Tensor):
ptr = cuda.CUdeviceptr(tensor.data_ptr())
elif cupy_available and isinstance(tensor, cp.ndarray):
ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
elif isinstance(tensor, cuda.CUdeviceptr):
ptr = tensor
elif isinstance(tensor, int):
ptr = cuda.CUdeviceptr(tensor)
else:
raise NotImplementedError(tensor)
# the dtype(0) is used to overload between different data types
# with the same layout
self.tensor_ref = cutlass.get_tensor_ref(int(ptr), dtype(0), layout)
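A minimal usage sketch (illustrative; it assumes the bindings expose `cutlass.float32` as a callable numeric type, which the `dtype(0)` overload above requires, and accept `cutlass.RowMajor` as the layout argument):

    import numpy as np

    a = np.zeros((128, 64), dtype=np.float32)
    ref = TensorRef(a, cutlass.float32, cutlass.RowMajor)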

View File

@ -1,4 +0,0 @@
from pycutlass.test.profiler import *
from pycutlass.test.conv2d_testbed import *
from pycutlass.test.gemm_testbed import *
from pycutlass.test.gemm_grouped_testbed import *

View File

@ -1,632 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass import *
from pycutlass.test import *
from time import sleep
from bfloat16 import bfloat16
import subprocess
from typeguard import typechecked
import re
def getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand):
ptr = tensor.__array_interface__['data'][0]
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
layout = tensor_layout.packed(tensor_coord)
if tensor.dtype == np.float64:
return cutlass.TensorRefF64NHWC(ptr, layout)
elif tensor.dtype == np.float32:
return cutlass.TensorRefF32NHWC(ptr, layout)
elif tensor.dtype == np.float16:
return cutlass.TensorRefF16NHWC(ptr, layout)
    elif tensor.dtype == bfloat16:
        return cutlass.TensorRefBF16NHWC(ptr, layout)
elif tensor.dtype == np.int32:
return cutlass.TensorRefS32NHWC(ptr, layout)
elif tensor.dtype == np.int8:
if tensor_layout == cutlass.TensorNC32HW32:
return cutlass.TensorRefS8NC32HW32(ptr, layout)
elif tensor_layout == cutlass.TensorC32RSK32:
return cutlass.TensorRefS8C32RSK32(ptr, layout)
else:
return cutlass.TensorRefS8NHWC(ptr, layout)
else:
raise ValueError("unsupported data type")
def getTensorView(tensor, tensor_layout, conv_kind, problem_size, operand):
tensor_ref = getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand)
if operand == "a":
tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
elif operand == "b":
tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
elif operand in ["c", "d"]:
tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
else:
raise ValueError("unknown operand: " + operand)
if tensor.dtype == np.float64:
return cutlass.TensorViewF64NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.float32:
return cutlass.TensorViewF32NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.float16:
return cutlass.TensorViewF16NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == bfloat16:
return cutlass.TensorViewBF16NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.int32:
return cutlass.TensorViewS32NHWC(tensor_ref, tensor_coord)
elif tensor.dtype == np.int8:
if tensor_layout == cutlass.TensorNC32HW32:
return cutlass.TensorViewS8NC32HW32(tensor_ref, tensor_coord)
elif tensor_layout == cutlass.TensorC32RSK32:
return cutlass.TensorViewS8C32RSK32(tensor_ref, tensor_coord)
else:
return cutlass.TensorViewS8NHWC(tensor_ref, tensor_coord)
else:
raise ValueError("unsupported data type")
# @typechecked
class Conv2dLauncher:
"""
Launcher that runs the operation on given problem size
"""
def __init__(self, operation: 'Conv2dOperation', seed: int=2080, interleaved=False,
verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
self.enable_cached_results = True
self.interleaved = interleaved
# create the reduction kernel
self.reduction_operation = ReductionOperation(
shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
count=operation.C.alignment
)
#: verify the output result
self.verification = verification
#: profile the kernel's runtime
self.profiling = profiling
self.timer = GpuTimer()
self.warmup_iterations = warmup_iterations
self.iterations = iterations
if "sleep" in kwargs.keys():
self.sleep_time = kwargs["sleep"]
else:
self.sleep_time = 0
#
# Compile the operator
#
pycutlass.compiler.add_module([operation, self.reduction_operation])
self.operation = operation
self.dtype_A = Conv2dLauncher.numpy_type(operation.A.element)
self.layout_A = operation.A.layout
self.dtype_B = Conv2dLauncher.numpy_type(operation.B.element)
self.layout_B = operation.B.layout
self.dtype_C = Conv2dLauncher.numpy_type(operation.C.element)
self.layout_C = operation.C.layout
self.dtype_D = Conv2dLauncher.numpy_type(operation.C.element)
self.layout_D = operation.C.layout
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
element_size = DataTypeSize[operation.A.element]
if element_size <= 8:
self.scope = 1
elif element_size == 16:
if accumulator_size <= 16:
self.scope = 2
else:
self.scope = 4
else:
self.scope = 7
# Seed
self.seed = seed
self.conv_kind = operation.conv_kind
#
# Get the host reference function
#
self.element_compute = operation.epilogue_functor.element_epilogue
self.host_conv2d = cutlass.test.conv.host.conv2d
self.timer = GpuTimer()
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def print_problem_size(self, p, split_k_mode=1):
print("nhwc_%dx%dx%dx%d_krsc_%dx%dx%dx%d_padding_%dx%d_stride_%dx%d_dilation_%dx%d_splitkslices_%d_splitkmode_%d"
% (p.N, p.H, p.W, p.C, p.K, p.R, p.S, p.C, p.pad_h,
p.pad_w, p.stride_h, p.stride_w, p.dilation_h, p.dilation_w, p.split_k_slices, split_k_mode))
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=-self.scope - 0.5, high=self.scope - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=-self.scope - 1, high=self.scope + 1,
size=size).astype(dtype)
def eq_gemm_size(self, problem_size):
n = problem_size.N
p = problem_size.P
q = problem_size.Q
k = problem_size.K
r = problem_size.R
s = problem_size.S
c = problem_size.C
h = problem_size.H
w = problem_size.W
if self.conv_kind == cutlass.conv.Operator.fprop:
return cutlass.gemm.GemmCoord(n * p * q, k, r * s * c)
elif self.conv_kind == cutlass.conv.Operator.dgrad:
return cutlass.gemm.GemmCoord(n * h * w, c, k * r * s)
else:
return cutlass.gemm.GemmCoord(k, r * s * c, n * p * q)
def bytes(self, problem_size, alpha, beta):
mnk = self.eq_gemm_size(problem_size)
bytes_ = \
(DataTypeSize[self.operation.A.element] * mnk.m() // 8) * mnk.k() + \
(DataTypeSize[self.operation.B.element] * mnk.n() // 8) * mnk.k() + \
(DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
if beta != 0:
bytes_ += (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()
return bytes_
def flops(self, problem_size):
mnk = self.eq_gemm_size(problem_size)
flops_mainloop_ = mnk.m() * mnk.n() * mnk.k() * 2
flops_epilogue_ = mnk.m() * mnk.n() * 2
# Adjust mainloop flop for dgrad stride
if self.conv_kind == cutlass.conv.Operator.dgrad:
flops_mainloop_ = flops_mainloop_ // (problem_size.stride_h * problem_size.stride_w)
flops_total_ = flops_mainloop_ + flops_epilogue_
return flops_total_
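    # Worked example (illustrative): an fprop problem with N=1, H=W=56,
    # C=64, K=64, R=S=1, unit stride and no padding maps to a GEMM with
    # M = 1*56*56 = 3136, N = 64, K = 1*1*64 = 64, giving
    # 2*3136*64*64 ~= 25.7 MFLOP in the mainloop plus
    # 2*3136*64 ~= 0.4 MFLOP in the epilogue.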
def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
if self.element_compute == cutlass.float16:
alpha = cutlass.float16(alpha)
beta = cutlass.float16(beta)
elif self.element_compute == cutlass.int32:
alpha = int(alpha)
beta = int(beta)
# if cached result is loaded
cached_result_loaded = False
if self.enable_cached_results:
# get problem key
cached_test_key = cutlass.test.conv.host.CreateCachedConv2dTestKey(
self.conv_kind, problem_size, alpha, beta,
getTensorView(tensor_A, self.layout_A, self.conv_kind, problem_size, "a"),
getTensorView(tensor_B, self.layout_B, self.conv_kind, problem_size, "b"),
getTensorView(tensor_C, self.layout_C, self.conv_kind, problem_size, "c"),
)
cached_test_result = cutlass.test.conv.host.CachedTestResult()
conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (self.operation.arch, self.seed)
cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
cached = cached_results.find(cached_test_key)
cached_result_loaded = cached[0]
            if cached_result_loaded:
                cached_test_result = cached[1]
if not cached_result_loaded:
# compute the conv2d on host
tensor_D_ref = np.ones_like(tensor_C)
tensor_ref_A = getTensorRef(tensor_A, self.layout_A, self.conv_kind, problem_size, "a")
tensor_ref_B = getTensorRef(tensor_B, self.layout_B, self.conv_kind, problem_size, "b")
tensor_ref_C = getTensorRef(tensor_C, self.layout_C, self.conv_kind, problem_size, "c")
tensor_ref_D_ref = getTensorRef(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
self.host_conv2d(
self.conv_kind, problem_size,
tensor_ref_A, tensor_ref_B, tensor_ref_C, tensor_ref_D_ref,
alpha, beta
)
tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
if self.enable_cached_results:
cached_test_result.D = cutlass.test.conv.host.TensorHash(tensor_view_D_ref)
cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
cached_results.append(cached_test_key, cached_test_result)
cached_results.write(conv2d_result_cache_name)
else:
return tensor_D_ref
return cached_test_result.D
def equal(self, tensor_D, tensor_D_ref, problem_size):
if self.enable_cached_results:
tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
tensor_D_hash = cutlass.test.conv.host.TensorHash(tensor_view_D)
return tensor_D_hash == tensor_D_ref
else:
tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
return cutlass.test.conv.host.equals(tensor_view_D, tensor_view_D_ref)
def run_cutlass_profiler(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, alpha=1.0, beta=0.0):
if split_k_mode == cutlass.conv.SplitKMode.Serial:
split_k_mode_ = "serial"
else:
split_k_mode_ = "parallel"
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
values = {
"profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
"kernel_name": self.operation.procedural_name(),
"verification_providers": "device",
"provider": "cutlass",
'n': str(problem_size.N),
'h': str(problem_size.H),
'w': str(problem_size.W),
'c': str(problem_size.C),
'k': str(problem_size.K),
'r': str(problem_size.R),
's': str(problem_size.S),
'p': str(problem_size.P),
'q': str(problem_size.Q),
'pad_h': str(problem_size.pad_h),
'pad_w': str(problem_size.pad_w),
'stride_h': str(problem_size.stride_h),
'stride_w': str(problem_size.stride_w),
'dilation_h': str(problem_size.dilation_h),
'dilation_w': str(problem_size.dilation_w),
'split_k_slices': str(problem_size.split_k_slices),
'split_k_mode': split_k_mode_,
'alpha': str(alpha),
'beta': str(beta),
'warmup': str(self.warmup_iterations),
'profile': str(self.iterations)
}
cmd_template = \
"${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
" --providers=${provider} --n=${n} --h=${h} --w=${w} --c=${c} --k=${k} --r=${r} --s=${s} --p=${p}" \
" --q=${q} --pad_h=${pad_h} --pad_w=${pad_w} --stride_h={stride_h} --stride_w=${stride_w}" \
" --dilation_h=${dilation_h} --dilation_w=${dilation_w} --warmup-iterations=${warmup} --profiling-iterations=${profile}" \
" --split_k_slices=${split_k_slices} --alpha=${alpha} --beta=${beta} --split_k_mode=${split_k_mode}"
cmd = SubstituteTemplate(cmd_template, values)
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group('runtime'))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group('bytes'))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group('flops'))
# check if the problem size matches
assert bytes == self.bytes(problem_size, alpha, beta)
assert flops == self.flops(problem_size)
return runtime
def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial,
alpha=1.0, beta=0.0):
        assert get_allocated_size() == 0, "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
#
# Initialize input and output tensors
#
tensor_A_size = cutlass.conv.implicit_gemm_tensor_a_size(self.conv_kind, problem_size)
tensor_B_size = cutlass.conv.implicit_gemm_tensor_b_size(self.conv_kind, problem_size)
tensor_C_size = cutlass.conv.implicit_gemm_tensor_c_size(self.conv_kind, problem_size)
np.random.seed(self.seed)
tensor_A = self.uniform_init(size=(tensor_A_size,), dtype=self.dtype_A)
tensor_B = self.uniform_init(size=(tensor_B_size,), dtype=self.dtype_B)
tensor_C = self.uniform_init(size=(tensor_C_size,), dtype=self.dtype_C)
tensor_D = np.zeros(shape=(tensor_C_size,), dtype=self.dtype_D)
#
# Launch kernel
#
arguments = Conv2dArguments(
operation=self.operation, problem_size=problem_size, A=tensor_A,
B=tensor_B, C=tensor_C, D=tensor_D,
output_op = self.operation.epilogue_type(alpha, beta),
split_k_slices=problem_size.split_k_slices,
split_k_mode=split_k_mode
)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
implicit_gemm_size = cutlass.conv.implicit_gemm_problem_size(self.operation.conv_kind, arguments.problem_size)
reduction_arguments = ReductionArguments(
self.reduction_operation,
problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], partitions=problem_size.split_k_slices,
workspace=arguments.ptr_D,
destination=tensor_D,
source=tensor_C,
output_op = self.reduction_operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
passed = True
if self.verification:
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
reduction_arguments.sync()
else:
arguments.sync()
tensor_D_ref = self.host_reference(problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)
passed = self.equal(tensor_D, tensor_D_ref, problem_size)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size, split_k_mode)
if self.profiling:
sleep(self.sleep_time)
for _ in range(self.warmup_iterations):
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
self.timer.start()
            for _ in range(self.iterations):
self.operation.run(arguments)
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
self.reduction_operation.run(reduction_arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
# free memory
del arguments
if split_k_mode == cutlass.conv.SplitKMode.Parallel:
del reduction_arguments
        assert get_allocated_size() == 0, "%d bytes of pool memory were not released after the current run" % get_allocated_size()
if self.profiling:
return runtime
return passed
########################################################################################################
# TestAllConv: runs the cutlass::conv::device::ImplicitGemmConvolution operator and compares it with a reference
# TestAllConv runs the conv operator on the default conv problem sizes from test::conv::device::TestbedConv2dProblemSizes
# Additionally, each conv2d test can provide its own problem sizes (conv_test_sizes) and a blacklist of
# sizes to skip (conv_blacklist_sizes)
############################################################################################################
def test_all_conv2d(operation: Conv2dOperation, conv_test_sizes = [], interleaved=False):
passed = True
#
# Testbed object
#
testbed = Conv2dLauncher(operation, interleaved=interleaved)
#
# Get conv problem sizes to run conv operator
#
conv_problems = cutlass.test.conv.TestbedConv2dProblemSizes(64)
# Vector of conv2d problem sizes to avoid duplicate runs
conv_tested_sizes = []
    # Flatten the 2D problem-size vectors into a 1D list of problem sizes
problem_sizes = conv_problems.conv2d_default_sizes
    problem_sizes = list(problem_sizes) + conv_test_sizes
# Sweep conv2d problem sizes (split-k-mode=kSerial, split-k-slices=1, alpha=1.0, beta=0.0)
for conv_problem in problem_sizes:
if conv_problem in conv_tested_sizes:
continue
# skip channel dimension % 32 != 0 for interleaved case
if interleaved:
if conv_problem.K % 32 != 0 or conv_problem.C % 32 != 0:
continue
#
# Procedurally disable certain cases
#
        # CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Unity:
if not ((conv_problem.stride_h == 1) and (conv_problem.stride_w == 1)):
continue
if not interleaved:
# Fixed channels algorithm requires channel count to match access size
if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.fixed_channels:
if conv_problem.C != operation.A.alignment:
continue
# Few channels algorithm requires channel count to match access size
if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.few_channels:
if conv_problem.C % operation.A.alignment:
continue
# CUTLASS DGRAD's *strided* stride specialization supports all stride {stride_h, stride_w}
# Although strided dgrad works for all stride combinations, we are only going
# to run strided dgrad for non-unity strides
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
if (conv_problem.stride_h == 1) and (conv_problem.stride_w == 1):
continue
#
# Test
#
# push back tested problem size to avoid re-running duplicates
conv_tested_sizes.append(conv_problem)
passed = testbed.run(conv_problem)
if not passed:
return False
if interleaved:
return True
#
# filter the cases for split K
#
# Small-channels convolution can't run here.
if operation.iterator_algorithm in [cutlass.conv.IteratorAlgorithm.fixed_channels, cutlass.conv.IteratorAlgorithm.few_channels]:
return True
    # CUTLASS DGRAD's *strided* specialization does not support split-k mode
if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
conv_problem = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 56, 56, 8),
cutlass.Tensor4DCoord(8, 1, 1, 8),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
)
passed = testbed.run(conv_problem)
return passed
    # Sweep split-k-slices using serial and parallel reduction with non-unity alpha and non-zero beta for
    # a single conv2d problem size. Convolution unit tests take a long time to run, so only sweep parameters
    # that are absolutely necessary to catch functional bugs. The code below does provide the option to sweep
    # alpha and beta for local testing, but only runs one value for each.
conv2d_split_k_test_size = cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 17, 11, 288),
cutlass.Tensor4DCoord(160, 3, 3, 288),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
)
split_k_modes = [cutlass.conv.SplitKMode.Parallel, cutlass.conv.SplitKMode.Serial]
split_k_slices = [1, 2, 3, 4, 201]
problem_alpha = [2.0,]
problem_beta = [2.0,]
for split_k_mode in split_k_modes:
for split_k_slice in split_k_slices:
for alpha in problem_alpha:
for beta in problem_beta:
passed = testbed.run(conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
split_k_mode,
alpha, beta)
return passed
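A typical invocation (a sketch; `operation` stands for a compiled Conv2dOperation built elsewhere):

    passed = test_all_conv2d(operation)
    assert passed
    # extra user-provided sizes can be appended via conv_test_sizes:
    # passed = test_all_conv2d(operation, conv_test_sizes=[my_problem_size])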

View File

@ -1,235 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pycutlass
from pycutlass.test.gemm_testbed import getTensorRef, getTensorView, transpose
from pycutlass import *
import numpy as np
import cutlass
from bfloat16 import bfloat16
class TestbedGrouped:
def __init__(self, operation: GemmOperationGrouped, seed: int = 2080) -> None:
pycutlass.compiler.add_module([operation])
self.seed = seed
self.operation = operation
element_size = DataTypeSize[operation.A.element]
self.dtype_A = self.numpy_type(operation.A.element)
self.dtype_B = self.numpy_type(operation.B.element)
self.dtype_C = self.numpy_type(operation.C.element)
self.dtype_D = self.numpy_type(operation.C.element)
if element_size == 1:
self.scope_max = 1
self.scope_min = 0
elif element_size <= 8:
self.scope_max = 1
self.scope_min = -1
elif element_size == 16:
self.scope_max = 4
self.scope_min = -4
else:
self.scope_max = 8
self.scope_min = -8
#: compute type
self.compute_type = operation.epilogue_functor.element_epilogue
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=self.scope_min - 0.5, high=self.scope_max - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=self.scope_min - 1, high=self.scope_max + 1,
size=size).astype(dtype)
def print_problem_size(self, p):
problem_size = "problem: %d, %d, %d\n" % (p.m(), p.n(), p.k())
print(problem_size)
def run(self, problem_count: int, alpha: float = 1.0, beta: float = 0.0) -> bool:
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
# initialize
np.random.seed(self.seed)
# generate the problem sizes
problem_sizes = []
tensor_As = []
tensor_Bs = []
tensor_Cs = []
tensor_Ds = []
tensor_D_refs = []
for i in range(problem_count):
if self.dtype_A == np.int8:
if i == 0:
problem_size = cutlass.gemm.GemmCoord(48, 16, 32)
else:
problem_size = cutlass.gemm.GemmCoord(
16 * np.random.randint(0, 64) + 48,
16 * np.random.randint(0, 64) + 48,
16 * np.random.randint(0, 64) + 48
)
else:
if i == 0:
problem_size = cutlass.gemm.GemmCoord(48, 16, 8)
else:
problem_size = cutlass.gemm.GemmCoord(
8 * np.random.randint(0, 64) + 24,
8 * np.random.randint(0, 64) + 24,
8 * np.random.randint(0, 64) + 24
)
tensor_As.append(
self.uniform_init(
size=(problem_size.m() * problem_size.k(),),
dtype=self.dtype_A)
)
tensor_Bs.append(
self.uniform_init(
size=(problem_size.n() * problem_size.k(),),
dtype=self.dtype_B)
)
tensor_Cs.append(
self.uniform_init(
size=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_C)
)
tensor_Ds.append(
np.zeros(
shape=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_D
)
)
tensor_D_refs.append(
np.ones(
shape=(problem_size.m() * problem_size.n(),),
dtype=self.dtype_D
)
)
problem_sizes.append(problem_size)
arguments = GemmGroupedArguments(
operation=self.operation, problem_sizes=problem_sizes,
A=tensor_As, B=tensor_Bs, C=tensor_Cs, D=tensor_Ds,
output_op=self.operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
arguments.sync()
#
# Reference check
#
alpha = self.compute_type(alpha).value()
beta = self.compute_type(beta).value()
init_acc = self.accumulator_type(0).value()
for idx, problem_size in enumerate(problem_sizes):
if self.operation.switched:
tensor_ref_A = getTensorRef(
tensor_As[idx], problem_size, "a", transpose(self.operation.B.layout))
tensor_ref_B = getTensorRef(
tensor_Bs[idx], problem_size, "b", transpose(self.operation.A.layout))
tensor_ref_C = getTensorRef(
tensor_Cs[idx], problem_size, "c", transpose(self.operation.C.layout))
tensor_ref_D_ref = getTensorRef(
tensor_D_refs[idx], problem_size, "d", transpose(self.operation.C.layout))
else:
tensor_ref_A = getTensorRef(
tensor_As[idx], problem_size, "a", self.operation.A.layout)
tensor_ref_B = getTensorRef(
tensor_Bs[idx], problem_size, "b", self.operation.B.layout)
tensor_ref_C = getTensorRef(
tensor_Cs[idx], problem_size, "c", self.operation.C.layout)
tensor_ref_D_ref = getTensorRef(
tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)
tensor_view_D_ref = getTensorView(
tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)
cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
tensor_view_D = getTensorView(
tensor_Ds[idx], problem_size, "d", self.operation.C.layout)
passed = cutlass.test.gemm.host.equals(
tensor_view_D, tensor_view_D_ref)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size)
del arguments
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released after the current run" % get_allocated_size()
return passed
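# Minimal usage sketch (illustrative, not part of the original file): driving the
# grouped-GEMM testbed. `make_grouped_operation()` is a hypothetical stand-in
# for the usual GemmOperationGrouped construction boilerplate.
def _testbed_grouped_example():
    operation = make_grouped_operation()  # hypothetical helper
    testbed = TestbedGrouped(operation, seed=2080)
    # Verify five random problem sizes with non-trivial alpha/beta.
    return testbed.run(problem_count=5, alpha=2.0, beta=1.0)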

View File

@ -1,594 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from time import sleep
import pycutlass
from pycutlass import *
import pycutlass.utils.datatypes as datatypes
import cutlass
from cuda import cudart
from cuda import cuda
from bfloat16 import bfloat16
from .profiler import GpuTimer
import subprocess
def transpose(layout):
if layout == cutlass.RowMajor:
return cutlass.ColumnMajor
elif layout == cutlass.ColumnMajor:
return cutlass.RowMajor
elif layout == cutlass.ColumnMajorInterleaved32:
return cutlass.RowMajorInterleaved32
elif layout == cutlass.RowMajorInterleaved32:
return cutlass.ColumnMajorInterleaved32
def getTensorRef(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout, batch_offset: int = 0):
ptr = tensor.__array_interface__['data'][0]
if operand == "a":
tensor_coord = problem_size.mk()
batch_stride = problem_size.m() * problem_size.k()
elif operand == "b":
tensor_coord = problem_size.kn()
batch_stride = problem_size.k() * problem_size.n()
elif operand in ["c", "d"]:
tensor_coord = problem_size.mn()
batch_stride = problem_size.m() * problem_size.n()
else:
raise ValueError("Unknown operand: " + operand)
elt_size = DataTypeSizeBytes[datatypes.to_cutlass(tensor.dtype)]
ptr += batch_offset * batch_stride * elt_size
if layout == cutlass.RowMajor:
layout = cutlass.RowMajor.packed(tensor_coord)
layout_tag = "RowMajor"
elif layout == cutlass.ColumnMajor:
layout = cutlass.ColumnMajor.packed(tensor_coord)
layout_tag = "ColumnMajor"
elif layout == cutlass.ColumnMajorInterleaved32:
layout = cutlass.ColumnMajorInterleaved32.packed(tensor_coord)
layout_tag = "ColumnMajorInterleaved32"
elif layout == cutlass.RowMajorInterleaved32:
layout = cutlass.RowMajorInterleaved32.packed(tensor_coord)
layout_tag = "RowMajorInterleaved32"
else:
raise ValueError("unsupported layout")
if tensor.dtype == np.float32:
ref_name = "TensorRefF32" + layout_tag
elif tensor.dtype == np.float64:
ref_name = "TensorRefF64" + layout_tag
elif tensor.dtype == np.float16:
ref_name = "TensorRefF16" + layout_tag
elif tensor.dtype == bfloat16:
ref_name = "TensorRefBF16" + layout_tag
elif tensor.dtype == np.int8:
ref_name = "TensorRefS8" + layout_tag
elif tensor.dtype == np.int32:
ref_name = "TensorRefS32" + layout_tag
else:
raise ValueError("unsupported datatype %s" %
ShortDataTypeNames[tensor.dtype])
return getattr(cutlass, ref_name)(ptr, layout)
def getTensorView(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: str, batch_offset: int = 0):
tensor_ref = getTensorRef(tensor, problem_size, operand, layout, batch_offset)
if operand == "a":
tensor_coord = problem_size.mk()
elif operand == "b":
tensor_coord = problem_size.kn()
elif operand in ["c", "d"]:
tensor_coord = problem_size.mn()
else:
raise ValueError("Unknown operand: " + operand)
if layout == cutlass.RowMajor:
layout_tag = "RowMajor"
elif layout == cutlass.ColumnMajor:
layout_tag = "ColumnMajor"
elif layout == cutlass.ColumnMajorInterleaved32:
layout_tag = "ColumnMajorInterleaved32"
elif layout == cutlass.RowMajorInterleaved32:
layout_tag = "RowMajorInterleaved32"
else:
raise ValueError("unsupported layout")
if tensor.dtype == np.float32:
ref_name = "TensorViewF32" + layout_tag
elif tensor.dtype == np.float64:
ref_name = "TensorViewF64" + layout_tag
elif tensor.dtype == np.float16:
ref_name = "TensorViewF16" + layout_tag
elif tensor.dtype == bfloat16:
ref_name = "TensorViewBF16" + layout_tag
elif tensor.dtype == np.int32:
ref_name = "TensorViewS32" + layout_tag
elif tensor.dtype == np.int8:
ref_name = "TensorViewS8" + layout_tag
else:
raise ValueError("unsupported datatype")
return getattr(cutlass, ref_name)(tensor_ref, tensor_coord)
class GemmUniversalLauncher:
def __init__(self, operation: 'GemmOperationUniversal', seed: int = 2080, interleaved=False,
verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
# create the reduction kernel
self.reduction_operation: ReductionOperation = ReductionOperation(
shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
count=operation.C.alignment
)
self.math_operation = operation.tile_description.math_instruction.math_operation
#: verify the output result
self.verification = verification
#: profile the kernel's runtime
self.profiling = profiling
self.timer = GpuTimer()
self.warmup_iterations = warmup_iterations
self.iterations = iterations
if "sleep" in kwargs.keys():
self.sleep_time = kwargs["sleep"]
else:
self.sleep_time = 0
#
# Compile the operator
#
op_list = [operation]
if operation.arch < 90:
# Split K via Python is currently only supported for pre-SM90 kernels
op_list.append(self.reduction_operation)
pycutlass.compiler.add_module(op_list)
self.operation = operation
self.dtype_A = GemmUniversalLauncher.numpy_type(operation.A.element)
self.dtype_B = GemmUniversalLauncher.numpy_type(operation.B.element)
self.dtype_C = GemmUniversalLauncher.numpy_type(operation.C.element)
self.dtype_D = GemmUniversalLauncher.numpy_type(operation.C.element)
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
element_size = DataTypeSize[operation.A.element]
if element_size == 1:
self.scope_max = 1
self.scope_min = 0
elif element_size <= 8:
self.scope_max = 1
self.scope_min = -1
elif element_size == 16:
self.scope_max = 4
self.scope_min = -4
else:
self.scope_max = 8
self.scope_min = -8
#: seed
self.seed: int = seed
#: whether the layout is interleaved
self.interleaved = interleaved
#: compute type
self.compute_type = operation.epilogue_functor.element_epilogue
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
def print_problem_size(self, p, mode, batch_count):
if mode == cutlass.gemm.Mode.Gemm:
mode = "Gemm"
elif mode == cutlass.gemm.Mode.Batched:
mode = "GemmBatched"
elif mode == cutlass.gemm.Mode.GemmSplitKParallel:
mode = "GemmSplitKParallel"
problem_size = "problem: %d, %d, %d\n batch_count: %d\n mode: %s" % (
p.m(), p.n(), p.k(), batch_count, mode)
print(problem_size)
@staticmethod
def numpy_type(type):
if type == cutlass.float64:
return np.float64
elif type == cutlass.float32:
return np.float32
elif type == cutlass.float16:
return np.float16
elif type == cutlass.bfloat16:
return bfloat16
elif type == cutlass.int32:
return np.int32
elif type == cutlass.int8:
return np.int8
else:
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
def uniform_init(self, size, dtype):
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
return np.ceil(
np.random.uniform(
low=self.scope_min - 0.5, high=self.scope_max - 0.5,
size=size).astype(dtype)
)
else:
return np.random.uniform(
low=self.scope_min - 1, high=self.scope_max + 1,
size=size).astype(dtype)
def reorder_tensor_B(self, tensor_B, problem_size):
reordered_tensor_B = np.empty_like(tensor_B)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", self.operation.B.layout)
reordered_tensor_ref_B = getTensorRef(
reordered_tensor_B, problem_size, "b", self.operation.B.layout)
cutlass.gemm.host.reorder_column(
tensor_ref_B, reordered_tensor_ref_B, problem_size)
return reordered_tensor_B
def host_reference(self, problem_size, batch_count, tensor_A, tensor_B, tensor_C, alpha, beta):
tensor_D_ref = np.ones_like(tensor_C)
alpha = self.numpy_type(self.compute_type)(alpha)
beta = self.numpy_type(self.compute_type)(beta)
init_acc = 0
alpha = self.compute_type(alpha).value()
beta = self.compute_type(beta).value()
init_acc = self.accumulator_type(init_acc).value()
for i in range(batch_count):
if self.operation.switched:
tensor_ref_A = getTensorRef(
tensor_A, problem_size, "a", transpose(self.operation.B.layout), batch_offset=i)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", transpose(self.operation.A.layout), batch_offset=i)
tensor_ref_C = getTensorRef(
tensor_C, problem_size, "c", transpose(self.operation.C.layout), batch_offset=i)
tensor_ref_D_ref = getTensorRef(
tensor_D_ref, problem_size, "d", transpose(self.operation.C.layout), batch_offset=i)
else:
tensor_ref_A = getTensorRef(
tensor_A, problem_size, "a", self.operation.A.layout, batch_offset=i)
tensor_ref_B = getTensorRef(
tensor_B, problem_size, "b", self.operation.B.layout, batch_offset=i)
tensor_ref_C = getTensorRef(
tensor_C, problem_size, "c", self.operation.C.layout, batch_offset=i)
tensor_ref_D_ref = getTensorRef(
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
if self.math_operation in [MathOperation.multiply_add_saturate]:
cutlass.test.gemm.host.gemm_saturate(
problem_size, alpha, tensor_ref_A, tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
else:
cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
return tensor_D_ref
def equal(self, tensor_D, tensor_D_ref, problem_size, batch_count):
for i in range(batch_count):
tensor_view_D = getTensorView(
tensor_D, problem_size, "d", self.operation.C.layout, batch_offset=i)
tensor_view_D_ref = getTensorView(
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
if not cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref):
return False
return True
def bytes(self, problem_size, batch_count=1, alpha=1.0, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
bytes = \
(DataTypeSize[self.operation.A.element] * m // 8) * k + \
(DataTypeSize[self.operation.B.element] * n // 8) * k + \
(DataTypeSize[self.operation.C.element] * m // 8) * n
if beta != 0:
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
bytes *= batch_count
return bytes
def flops(self, problem_size, batch_count=1):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
flops_ = (m * n * k) * 2 * batch_count
return flops_
def run_cutlass_profiler(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):
cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
values = {
"profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
"kernel_name": self.operation.procedural_name(),
"verification_providers": "device",
"provider": "cutlass",
"m": str(problem_size.m()),
"n": str(problem_size.n()),
"k": str(problem_size.k()),
'split_k_slices': str(batch_count),
'alpha': str(alpha),
'beta': str(beta),
'warmup': str(self.warmup_iterations),
'profile': str(self.iterations)
}
cmd_template = \
"${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
" --providers=${provider} --m=${m} --n=${n} --k=${k}"
cmd = SubstituteTemplate(cmd_template, values)
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group('runtime'))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group('bytes'))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group('flops'))
# check if the problem size matches
        assert bytes == self.bytes(problem_size, alpha=alpha, beta=beta)
assert flops == self.flops(problem_size)
return runtime
def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released in the previous run" % get_allocated_size()
np.random.seed(self.seed)
# Assign an actual batch count in cases where we are not running in batched mode.
# This is to differentiate between the number of split K slices and the batch count,
# which are overloaded within the single `batch_count` variable.
true_batch_count = batch_count if mode == cutlass.gemm.Mode.Batched else 1
tensor_A = self.uniform_init(
size=(problem_size.m() * problem_size.k() * true_batch_count,), dtype=self.dtype_A)
tensor_B = self.uniform_init(
size=(problem_size.n() * problem_size.k() * true_batch_count,), dtype=self.dtype_B)
tensor_C = self.uniform_init(
size=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_C)
tensor_D = np.zeros(
shape=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_D)
#
# Launch kernel
#
arguments = GemmArguments(
operation=self.operation, problem_size=problem_size,
A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
output_op=self.operation.epilogue_type(alpha, beta),
gemm_mode=mode, split_k_slices=split_k_slices, batch=batch_count
)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
reduction_arguments = ReductionArguments(
self.reduction_operation, problem_size=[
problem_size.m(), problem_size.n()],
partitions=split_k_slices,
workspace=arguments.ptr_D,
destination=tensor_D,
source=tensor_C,
output_op=self.reduction_operation.epilogue_type(alpha, beta)
)
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
passed = True
if self.verification:
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
reduction_arguments.sync()
else:
arguments.sync()
tensor_D_ref = self.host_reference(
problem_size, true_batch_count, tensor_A, tensor_B, tensor_C, alpha, beta)
passed = self.equal(tensor_D, tensor_D_ref, problem_size, true_batch_count)
try:
assert passed
except AssertionError:
self.print_problem_size(problem_size, mode, batch_count)
if self.profiling:
sleep(self.sleep_time)
for _ in range(self.warmup_iterations):
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
self.timer.start()
for _ in range(self.iterations):
self.operation.run(arguments)
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
self.reduction_operation.run(reduction_arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
# free memory and clear buffers
del arguments
if mode == cutlass.gemm.Mode.GemmSplitKParallel:
del reduction_arguments
        assert get_allocated_size() == 0, \
            "%d bytes of pool memory were not released after the current run" % get_allocated_size()
if self.profiling:
return runtime
return passed
def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"):
passed = True
minimum_operand_element_size = min(
DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
opcode_class = operation.tile_description.math_instruction.opcode_class
if opcode_class == cutlass.OpClass.Simt:
alignment = 1
else:
alignment = 128 // minimum_operand_element_size
# int8_t gemm alignment constraints
if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 and operation.A.layout == cutlass.ColumnMajor:
alignment_m = 4
else:
alignment_m = alignment
if opcode_class == cutlass.OpClass.Simt and operation.B.element == cutlass.int8 and operation.A.layout == cutlass.RowMajor:
alignment_n = 4
else:
alignment_n = alignment
if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 \
and operation.B.element == cutlass.int8 \
and (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor):
alignment_k = 4
else:
alignment_k = alignment
threadblock_k = operation.tile_description.threadblock_shape[2]
if testcase == "interleaved":
if operation.A.layout in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
interleavedk = 32
else:
raise ValueError("Unknown layout")
if testcase == "interleaved":
modes = [cutlass.gemm.Mode.Gemm, ]
problem_size_m = [interleavedk, 512+interleavedk]
problem_size_n = [interleavedk, 512+interleavedk]
problem_size_k = [interleavedk, threadblock_k *
operation.tile_description.stages + interleavedk]
problem_alpha = [1.0]
problem_beta = [0.0]
batch_counts = [1, ]
elif testcase == "multistage":
modes = [cutlass.gemm.Mode.Gemm, ]
problem_size_m = [16, 528]
problem_size_n = [16, 528]
problem_size_k = [threadblock_k, threadblock_k * operation.tile_description.stages +
operation.tile_description.math_instruction.instruction_shape[2]]
problem_alpha = [1.0]
problem_beta = [0.0]
batch_counts = [1, ]
else: # universal
modes = [cutlass.gemm.Mode.Gemm]
batch_counts = [1, 2, 3, 5, 7]
if operation.arch < 90:
# Split K kernels via Python are currently only supported pre-SM90
modes.append(cutlass.gemm.Mode.GemmSplitKParallel)
problem_size_m = [alignment_m, 512 - 3 * alignment_m]
problem_size_n = [alignment_n, 512 - 2 * alignment_n]
if operation.tile_description.stages is None:
stages_for_k_calc = 7
else:
stages_for_k_calc = operation.tile_description.stages
problem_size_k = [
alignment_k,
threadblock_k * stages_for_k_calc - alignment_k,
threadblock_k * stages_for_k_calc * 3 - alignment_k]
problem_alpha = [1.0]
problem_beta = [2.0]
testbed = GemmUniversalLauncher(
operation, interleaved=(testcase == "interleaved"))
for mode in modes:
for m in problem_size_m:
for n in problem_size_n:
for k in problem_size_k:
for batch_count in batch_counts:
for alpha in problem_alpha:
for beta in problem_beta:
# skip very small K problems
if testcase == "universal":
if (k // batch_count < 2 * threadblock_k):
continue
problem_size = cutlass.gemm.GemmCoord(m, n, k)
if operation.arch < 90:
split_k_slices = batch_count
else:
split_k_slices = 1
overridden_mode = mode
if mode == cutlass.gemm.Mode.Gemm and batch_count > 1:
overridden_mode = cutlass.gemm.Mode.Batched
passed = testbed.run(
overridden_mode, problem_size, batch_count, split_k_slices, alpha, beta)
err, = cudart.cudaDeviceSynchronize()
                                if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(
"CUDA Error %s" % str(err))
if not passed:
return False
return passed
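# Minimal usage sketch (illustrative, not part of the original file): running the
# full sweep for one compiled kernel. `make_universal_operation()` is a
# hypothetical stand-in for the GemmOperationUniversal construction code.
def _test_all_gemm_example():
    operation = make_universal_operation()  # hypothetical helper
    # "universal" sweeps plain, batched, and (pre-SM90) parallel split-K modes.
    return test_all_gemm(operation, testcase="universal")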

View File

@ -1,70 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from cuda import cuda
from cuda import cudart
class GpuTimer:
def __init__(self) -> None:
self.events = [
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
]
def start(self, stream=cuda.CUstream(0)):
err, = cuda.cuEventRecord(self.events[0], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def stop(self, stream=cuda.CUstream(0)):
err, = cuda.cuEventRecord(self.events[1], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
def stop_and_wait(self, stream=cuda.CUstream(0)):
self.stop(stream)
if stream:
err, = cuda.cuStreamSynchronize(stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
else:
err, = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError("CUDA Error %s" % str(err))
def duration(self, iterations=1):
err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError("CUDA Error %s" % str(err))
return duration / float(iterations)
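# Minimal usage sketch (illustrative, not part of the original file): timing a
# region of asynchronous GPU work on the default stream. `launch_kernel()` is a
# hypothetical stand-in for any kernel launch.
def _gpu_timer_example(iterations=100):
    timer = GpuTimer()
    timer.start()
    for _ in range(iterations):
        launch_kernel()  # hypothetical asynchronous GPU work
    timer.stop_and_wait()
    # Average milliseconds per iteration, per cuEventElapsedTime semantics.
    return timer.duration(iterations)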

View File

@ -1,109 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass
from pycutlass import library, SubstituteTemplate
class Layout:
"""
Utility class to map transpose and non-transpose terminology to row- and column-major terminology
"""
T = cutlass.RowMajor
N = cutlass.ColumnMajor
class LayoutCombination:
"""
    Utility class defining all combinations of row- and column-major layouts for operands to a GEMM
"""
NNN = (Layout.N, Layout.N, Layout.N)
NNT = (Layout.N, Layout.N, Layout.T)
NTN = (Layout.N, Layout.T, Layout.N)
NTT = (Layout.N, Layout.T, Layout.T)
TNN = (Layout.T, Layout.N, Layout.N)
TNT = (Layout.T, Layout.N, Layout.T)
TTN = (Layout.T, Layout.T, Layout.N)
TTT = (Layout.T, Layout.T, Layout.T)
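# Illustrative sketch (not part of the original file): each combination unpacks
# into per-operand layouts for A, B, and C.
_layout_A, _layout_B, _layout_C = LayoutCombination.TNT
assert _layout_A == cutlass.RowMajor and _layout_B == cutlass.ColumnMajor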
def get_name(layouts, alignments, element_output,
element_accumulator, element_epilogue, cluster_shape,
threadblock_shape, stages, element_a, element_b, arch, opclass, suffix=""):
"""
Generates a procedural name for a test case.
:param layouts: indexable container of layouts of A, B, and C operands
:param alignments: indexable container of alignments of A, B, and C operands
:param element_output: data type of the output element
:param element_accumulator: data type used in accumulation
:param element_epilogue: data type used in computing the epilogue
:param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
:param threadblock_shape: indexable container of dimensions of threadblock tiles
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param element_a: data type of operand A
:param element_b: data type of operand B
:param arch: compute capability of kernel being generated
:type arch: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param suffix: additional string to add to the suffix of the name
:type suffix: str
:return: str
"""
name_format = 'test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${suffix}'
return SubstituteTemplate(name_format,
{
'arch': str(arch),
'eA': library.DataTypeNames[element_a],
'eB': library.DataTypeNames[element_b],
'eC': library.DataTypeNames[element_output],
'lA': library.ShortLayoutTypeNames[layouts[0]],
'lB': library.ShortLayoutTypeNames[layouts[1]],
'lC': library.ShortLayoutTypeNames[layouts[2]],
'opclass': library.OpcodeClassNames[opclass],
'acc': library.DataTypeNames[element_accumulator],
'cM': str(cluster_shape[0]),
'cN': str(cluster_shape[1]),
'cK': str(cluster_shape[2]),
'tbM': str(threadblock_shape[0]),
'tbN': str(threadblock_shape[1]),
'tbK': str(threadblock_shape[2]),
'stages': str(stages) if stages is not None else 'auto',
'aA' : str(alignments[0]),
'aB' : str(alignments[1]),
'aC' : str(alignments[2]),
'suffix': '' if suffix is None else suffix
}
)
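# Illustrative sketch (not part of the original file): a typical call. With
# these hypothetical arguments, the template above yields a name of roughly the
# form 'test_SM80_Device_Gemm_f16t_f16n_f16t_tensorop_f32_128x128x32_1x1x1_2_align8-8-8'.
def _get_name_example():
    return get_name(
        layouts=LayoutCombination.TNT, alignments=[8, 8, 8],
        element_output=cutlass.float16, element_accumulator=cutlass.float32,
        element_epilogue=cutlass.float32, cluster_shape=[1, 1, 1],
        threadblock_shape=[128, 128, 32], stages=2,
        element_a=cutlass.float16, element_b=cutlass.float16,
        arch=80, opclass=cutlass.OpClass.TensorOp)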

View File

@ -1,39 +0,0 @@
################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
from typing import Union
from typeguard import typechecked
GemmOperation = 'Union[GemmOperationUniversal, GemmOperationGrouped]'
Tensor = 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]'

View File

@ -1 +0,0 @@
from pycutlass.utils.reference_model import *

View File

@ -1,121 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for converting between frontend datatypes and CUTLASS datatypes
"""
from typing import Union, Tuple
import cutlass
import pycutlass.library as library
try:
import numpy as np
numpy_available = True
except ImportError:
numpy_available = False
def numpy_to_cutlass(inp):
if numpy_available:
if inp == np.float16:
return cutlass.float16
elif inp == np.float32:
return cutlass.float32
elif inp == np.float64:
return cutlass.float64
elif inp == np.int8:
return cutlass.int8
elif inp == np.int32:
return cutlass.int32
return None
try:
import cupy as cp
cupy_available = True
cupy_to_cutlass_dict = {
cp.float16: cutlass.float16,
cp.float32: cutlass.float32,
cp.float64: cutlass.float64
}
except ImportError:
cupy_available = False
def cupy_to_cutlass(inp):
if cupy_available:
if inp == cp.float16:
return cutlass.float16
elif inp == cp.float32:
return cutlass.float32
elif inp == cp.float64:
return cutlass.float64
return None
try:
import torch
torch_available = True
torch_to_cutlass_dict = {
torch.half: cutlass.float16,
torch.float16: cutlass.float16,
torch.float: cutlass.float32,
torch.float32: cutlass.float32,
torch.double: cutlass.float64,
torch.float64: cutlass.float64
}
except ImportError:
torch_available = False
def torch_to_cutlass(inp):
if torch_available:
return torch_to_cutlass_dict.get(inp, None)
try:
import bfloat16
bfloat16_available = True
except ImportError:
bfloat16_available = False
def bfloat16_to_cutlass(inp):
if bfloat16_available:
if inp == bfloat16.bfloat16:
return cutlass.bfloat16
def to_cutlass(inp):
for cvt_fn in [bfloat16_to_cutlass, cupy_to_cutlass, numpy_to_cutlass, torch_to_cutlass]:
out = cvt_fn(inp)
if out is not None:
return out
raise Exception('No available conversion from type {} to a CUTLASS type.'.format(inp))
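# Illustrative sketch (not part of the original file): to_cutlass tries each
# frontend converter in turn, so one entry point covers numpy, cupy, torch, and
# bfloat16 inputs. Assumes numpy is installed.
def _to_cutlass_example():
    assert to_cutlass(np.float32) == cutlass.float32
    assert to_cutlass(np.int8) == cutlass.int8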

View File

@ -1,76 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for interacting with the device
"""
from cuda import cudart
def check_cuda_errors(result: list):
"""
    Checks whether `result` contains a CUDA error and raises it as an exception if so.
    Otherwise, returns the result contained in the remaining fields of `result`.
:param result: the results of the `cudart` method, consisting of an error code and any method results
:type result: list
:return: non-error-code results from the `results` parameter
"""
# `result` is of the format : (cudaError_t, result...)
err = result[0]
if err.value:
raise RuntimeError("CUDA error: {}".format(cudart.cudaGetErrorName(err)))
if len(result) == 1:
return None
elif len(result) == 2:
return result[1]
else:
return result[1:]
def device_cc(device: int = 0) -> int:
"""
Returns the compute capability of the device with ID `device`.
:param device: ID of the device to query
:type device: int
:return: compute capability of the queried device (e.g., 80 for SM80)
:rtype: int
"""
deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
major = str(deviceProp.major)
minor = str(deviceProp.minor)
return int(major + minor)
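# Illustrative sketch (not part of the original file): check_cuda_errors unwraps
# the (error, value...) tuples returned by cuda-python, and device_cc composes
# major/minor into, e.g., 80 or 90.
def _device_example():
    cc = device_cc()  # compute capability of device 0
    count = check_cuda_errors(cudart.cudaGetDeviceCount())
    return cc, count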

View File

@ -1,255 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import numpy as np
import cutlass
from pycutlass.library import TensorDescription
from typing import Union
from bfloat16 import bfloat16
try:
import torch
torch_available = True
except ImportError:
torch_available = False
class ReferenceModule:
def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription) -> None:
self.layout_A = A.layout
self.layout_B = B.layout
self.layout_C = C.layout
def run(self, A: np.ndarray, B: np.ndarray, C: np.ndarray, problem_size: cutlass.gemm.GemmCoord, alpha: float=1.0, beta: float=0.0, bias=False, batch=1):
"""
Compute the reference result on CPU
Args:
            A: dense operand with shape (M, K) in row-major and (K, M) in column-major
            B: dense operand with shape (K, N) in row-major and (N, K) in column-major
            C: dense operand with shape (M, N) in row-major and (N, M) in column-major
"""
M, N, K = problem_size.m(), problem_size.n(), problem_size.k()
if isinstance(A, np.ndarray):
if self.layout_A == cutlass.RowMajor:
A_row = np.reshape(A, newshape=(batch, M, K))
else:
A_col = np.reshape(A, newshape=(batch, K, M))
A_row = np.transpose(A_col, axes=(0, 2, 1))
if self.layout_B == cutlass.RowMajor:
B_row = np.reshape(B, newshape=(batch, K, N))
else:
B_col = np.reshape(B, newshape=(batch, N, K))
B_row = np.transpose(B_col, axes=(0, 2, 1))
if self.layout_C == cutlass.RowMajor:
if bias:
C_row = np.reshape(C, newshape=(batch, 1, N))
else:
C_row = np.reshape(C, newshape=(batch, M, N))
else:
if bias:
C_row = np.reshape(C, newshape=(batch, M, 1))
else:
C_col = np.reshape(C, newshape=(batch, N, M))
C_row = np.transpose(C_col, axes=(0, 2, 1))
if A_row.dtype == bfloat16:
# numpy's einsum doesn't support bfloat16
out_row = np.einsum("bik,bkj->bij", A_row.astype(np.float32), B_row.astype(np.float32)) * alpha + C_row * beta
out_row = out_row.astype(C_row.dtype)
else:
out_row = np.einsum("bik,bkj->bij", A_row, B_row) * alpha + C_row * beta
if self.layout_C == cutlass.ColumnMajor:
out = np.transpose(out_row, axes=(0, 2, 1))
else:
out = out_row
return out.ravel()
elif isinstance(A, torch.Tensor):
if self.layout_A == cutlass.RowMajor:
A_row = A.view((M, K))
else:
A_col = A.view((K, M))
A_row = torch.permute(A_col, (1, 0))
if self.layout_B == cutlass.RowMajor:
B_row = B.view((K, N))
else:
B_col = B.view((N, K))
B_row = torch.permute(B_col, (1, 0))
if self.layout_C == cutlass.RowMajor:
C_row = C.view((M, N))
else:
C_col = C.view((N, M))
C_row = torch.permute(C_col, (1, 0))
out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
if self.layout_C == cutlass.ColumnMajor:
out = torch.permute(out_row, (1, 0))
else:
out = out_row
return torch.flatten(out)
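# Minimal usage sketch (illustrative, not part of the original file): checking a
# flattened GEMM result against this reference. The TensorDescription arguments
# and input arrays are hypothetical.
def _reference_module_example(desc_A, desc_B, desc_C, A, B, C, D, M, N, K):
    ref = ReferenceModule(desc_A, desc_B, desc_C)
    D_ref = ref.run(A, B, C, cutlass.gemm.GemmCoord(M, N, K), alpha=1.0, beta=0.0)
    return np.allclose(D.astype(np.float32), D_ref.astype(np.float32))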
#####################################################################################################
# Conv2d
#####################################################################################################
if torch_available:
class Conv2dReferenceModule:
        def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription, kind: cutlass.conv.Operator) -> None:
self.layout_A = A.layout
self.layout_B = B.layout
self.layout_C = C.layout
self.kind = kind
def run(self,
A: Union[np.ndarray, torch.Tensor],
B: Union[np.ndarray, torch.Tensor],
C: Union[np.ndarray, torch.Tensor], problem_size, alpha=1.0, beta=0.0, bias=False) -> np.ndarray:
"""
Compute the reference result on CPU
"""
n = problem_size.N
h = problem_size.H
w = problem_size.W
c = problem_size.C
k = problem_size.K
r = problem_size.R
s = problem_size.S
p = problem_size.P
q = problem_size.Q
stride_h = problem_size.stride_h
stride_w = problem_size.stride_w
pad_h = problem_size.pad_h
pad_w = problem_size.pad_w
dilation_h = problem_size.dilation_h
dilation_w = problem_size.dilation_w
groups = problem_size.groups
if isinstance(A, np.ndarray):
# the pytorch activation layout is NCHW
# weight layout is Cout Cin Kh Kw (also NCHW)
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = np.reshape(A, newshape=(n, h, w, c))
A_torch_nhwc = torch.from_numpy(A_nhwc).to("cuda")
A_torch_nchw = torch.permute(A_torch_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = np.reshape(B, newshape=(k, r, s, c))
B_torch_nhwc = torch.from_numpy(B_nhwc).to("cuda")
B_torch_nchw = torch.permute(B_torch_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
C_nhwc = np.reshape(C, newshape=(n, p, q, k))
C_torch_nhwc = torch.from_numpy(C_nhwc).to("cuda")
C_torch_nchw = torch.permute(C_torch_nhwc, (0, 3, 1, 2))
elif isinstance(A, torch.Tensor):
if self.kind == cutlass.conv.Operator.wgrad:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, p, q, k))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((n, h, w, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, c))
else:
C_nhwc = C.view((k, r, s, c))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
elif self.kind == cutlass.conv.Operator.dgrad:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, p, q, k))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((k, r, s, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, c))
else:
C_nhwc = C.view((n, h, w, c))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
else:
if self.layout_A == cutlass.TensorNHWC:
A_nhwc = A.view((n, h, w, c))
A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))
if self.layout_B == cutlass.TensorNHWC:
B_nhwc = B.view((k, r, s, c))
B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))
if self.layout_C == cutlass.TensorNHWC:
if bias:
C_nhwc = C.view((1, 1, 1, k))
else:
C_nhwc = C.view((n, p, q, k))
C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
if self.kind == cutlass.conv.Operator.fprop:
D_torch_nchw = alpha * torch.nn.functional.conv2d(
A_torch_nchw, B_torch_nchw, stride=(stride_h, stride_w),
padding=(pad_h, pad_w), dilation=(dilation_h, dilation_w), groups=groups) + beta * C_torch_nchw
elif self.kind == cutlass.conv.Operator.dgrad:
D_torch_nchw = alpha * torch.nn.grad.conv2d_input(
(n, c, h, w), B_torch_nchw, A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
).to(torch.float32) + beta * C_torch_nchw
elif self.kind == cutlass.conv.Operator.wgrad:
D_torch_nchw = alpha * torch.nn.grad.conv2d_weight(
B_torch_nchw, (k, c, r, s), A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
).to(torch.float32) + beta * C_torch_nchw
if self.layout_C == cutlass.TensorNHWC:
if isinstance(A, np.ndarray):
D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1)).detach().cpu().numpy()
elif isinstance(A, torch.Tensor):
D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1))
return D_torch_out.flatten()
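# Minimal usage sketch (illustrative, not part of the original file): verifying
# an fprop result with torch as the oracle. The TensorDescription arguments and
# the NHWC problem size are hypothetical.
def _conv2d_reference_example(desc_A, desc_B, desc_C, A, B, C, D, problem_size):
    assert torch_available, "the reference computation uses torch under the hood"
    ref = Conv2dReferenceModule(desc_A, desc_B, desc_C, cutlass.conv.Operator.fprop)
    D_ref = ref.run(A, B, C, problem_size, alpha=1.0, beta=0.0)
    return np.allclose(D, D_ref)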

View File

@ -1,274 +0,0 @@
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
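
A note on the rows above: each line records one functional conv2d test. The first three fields are the operation, the problem geometry (input NHWC extent, output HxW, filter KxRxSxC, padding, stride, dilation, convolution mode, alpha/beta), and an element-type/layout signature (reading "hnhwc_hnhwc_fnhwc_f_f" as f16/NHWC A and B, f32/NHWC C, with f32 accumulator and compute types is an assumption consistent with the test names, not something the diff states). The four trailing integers are presumably 32-bit checksums of the A, B, and C tensors and the reference result D. A minimal parsing sketch under those assumptions:

# Hypothetical parser for one cached conv2d test row; the meaning of the four
# trailing integers (assumed: checksums of A, B, C and the reference D) is an
# assumption, not confirmed by this diff.
from typing import NamedTuple, Tuple

class CachedConv2dRow(NamedTuple):
    op: str                  # "conv2d"
    problem: str             # geometry string, e.g. "dgrad_1x7x8x64_..."
    types: str               # element/layout signature, e.g. "hnhwc_hnhwc_hnhwc_h_h"
    hashes: Tuple[int, ...]  # four 32-bit values

def parse_row(line: str) -> CachedConv2dRow:
    op, problem, types, *rest = line.split()
    return CachedConv2dRow(op, problem, types, tuple(int(x) for x in rest))

row = parse_row(
    "conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 "
    "hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884")
assert row.hashes == (2754803027, 3456572634, 893492926, 1966259884)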

View File

@ -1,233 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass  # needed for pycutlass.get_memory_pool() below
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self.assertTrue(test_all_conv2d(operation, problem_sizes))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
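
These suites are plain unittest modules; besides unittest.main(), a single case can be loaded and run programmatically. A sketch, assuming the file above is importable under the hypothetical module name conv2d_dgrad_f16_sm80:

# Hypothetical driver for one test from the suite above; the module name
# "conv2d_dgrad_f16_sm80" is illustrative only.
import unittest
import pycutlass

pycutlass.get_memory_pool(2**26, 2**26)  # same 2**26-byte pools the file allocates

suite = unittest.defaultTestLoader.loadTestsFromName(
    "conv2d_dgrad_f16_sm80."
    "Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80."
    "test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16")
unittest.TextTestRunner(verbosity=2).run(suite)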

View File

@ -1,209 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
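
The four variants above vary only the threadblock K extent (32 vs. 64) and the pipeline depth (3 vs. 4 stages). A rough shared-memory estimate for the pipelined mainloop, counting only the staged f16 A and B tiles and ignoring padding and epilogue storage (an approximation, not CUTLASS's exact allocator arithmetic):

# Approximate staged shared memory for an MxNxK threadblock with f16 A/B:
# each stage buffers an MxK A tile and a KxN B tile (2 bytes per element).
def mainloop_smem_bytes(m, n, k, stages, elem_bytes=2):
    return (m * k + k * n) * elem_bytes * stages

for (m, n, k), stages in [((128, 128, 32), 3), ((128, 128, 32), 4),
                          ((128, 128, 64), 3), ((128, 128, 64), 4)]:
    print(f"{m}x{n}x{k}, {stages} stages: ~{mainloop_smem_bytes(m, n, k, stages) // 1024} KiB")

# Prints ~48/64 KiB for the K=32 tiles and ~96/128 KiB for K=64, all within
# SM80's opt-in limit of 163 KiB of shared memory per threadblock.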

View File

@ -1,130 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[4, 2, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[2, 4, 1],
            math_instruction=math_inst
        )
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)
        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
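
Note that the two SIMT tests swap warp_count between [4, 2, 1] and [2, 4, 1]: the warp tiling over the 128x128 threadblock changes, but the warp total and hence the launched threads per block do not. A quick identity (not a pycutlass API):

# Threads per threadblock implied by warp_count = [warps_m, warps_n, warps_k].
WARP_SIZE = 32

def threads_per_block(warp_count):
    warps_m, warps_n, warps_k = warp_count
    return warps_m * warps_n * warps_k * WARP_SIZE

assert threads_per_block([4, 2, 1]) == 256
assert threads_per_block([2, 4, 1]) == 256  # same 8 warps, different m/n split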

View File

@ -1,127 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float32, element_b=cutlass.float32,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 16], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Unity,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float32, element_b=cutlass.float32,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float32,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 16], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Unity,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
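
All four deleted test files in this diff assemble operations the same way: a MathInstruction feeds two input TensorDescriptions and a TileDescription, a LinearCombination epilogue is layered on top, and the resulting Conv2dOperation is handed to test_all_conv2d. A minimal sketch of that shared pattern, factored into a hypothetical helper (make_conv2d_operation is not a pycutlass API; it only reuses the names these tests already import):

import pycutlass
from pycutlass import *
from pycutlass.test import *

def make_conv2d_operation(conv_kind, iterator_algorithm, element, alignment,
                          element_accumulator, instruction_shape,
                          threadblock_shape, stages,
                          stride_support=StrideSupport.Strided):
    # Hypothetical helper: builds one Conv2dOperation exactly the way each
    # deleted test in this diff does, so a test body reduces to one call.
    math_inst = MathInstruction(
        instruction_shape=instruction_shape,
        element_a=element, element_b=element,
        element_accumulator=element_accumulator,
        opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )
    A = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=alignment)
    B = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=alignment)
    # Every file here writes C with alignment 8.
    C = TensorDescription(element=element, layout=cutlass.TensorNHWC, alignment=8)
    tile_description = TileDescription(
        threadblock_shape=threadblock_shape, stages=stages,
        warp_count=[2, 2, 1],
        math_instruction=math_inst
    )
    # In all four files the epilogue scalar type matches the accumulator type.
    epilogue_functor = LinearCombination(
        C.element, C.alignment,
        math_inst.element_accumulator, math_inst.element_accumulator)
    return Conv2dOperation(
        conv_kind=conv_kind, iterator_algorithm=iterator_algorithm,
        arch=80, tile_description=tile_description, A=A, B=B, C=C,
        stride_support=stride_support,
        epilogue_functor=epilogue_functor,
        swizzling_functor=cutlass.IdentitySwizzle1
    )

A test body then reduces to constructing the operation and asserting test_all_conv2d(operation).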

View File

@ -1,195 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
def conv2d_few_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 8, 8, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 16, 16, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 16, 16, channels),
cutlass.Tensor4DCoord(16, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(32, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
return problem_sizes
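# The positional arguments to cutlass.conv.Conv2dProblemSize above appear to
# mirror the C++ Conv2dProblemSize constructor: input extent (N, H, W, C),
# filter extent (K, R, S, C), padding, stride, dilation, convolution mode,
# and the trailing "1, 1" as split-K slice count and group count. This
# reading is inferred from the values these tests pass, not stated in the diff.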
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 8],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=1)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=1)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 32], stages=2,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
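
Each problem-size helper above pairs an input extent with a filter extent, padding, stride, and dilation; the resulting output extent follows the standard closed form for cross-correlation. A small sketch checking that arithmetic against one of the cases above (the formula is textbook-standard, not taken from this diff):

def conv2d_output_extent(input_extent, filter_extent, pad, stride, dilation=1):
    # Standard output-size formula: floor((I + 2p - effective_filter) / s) + 1.
    effective_filter = dilation * (filter_extent - 1) + 1
    return (input_extent + 2 * pad - effective_filter) // stride + 1

# 224x224 input, 7x7 filter, padding 1, stride 2, as in the helper above:
assert conv2d_output_extent(224, 7, pad=1, stride=2) == 110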

View File

@ -1,219 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
def conv2d_fixed_channel_problemsizes(channels):
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 8, 8, channels),
cutlass.Tensor4DCoord(16, 3, 3, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(32, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 7, 7, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 224, 224, channels),
cutlass.Tensor4DCoord(64, 5, 5, channels),
cutlass.Tensor4DCoord(1, 1, 1, 1),
cutlass.MatrixCoord(2, 2),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
return problem_sizes
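# Note the pattern in the fixed-channels tests below: the A/B alignment is
# always set equal to the channel count passed to
# conv2d_fixed_channel_problemsizes (8, 4, then 2), so every access along the
# innermost channel dimension stays fully vectorized. That correspondence is
# observed from these tests themselves, not stated elsewhere in this diff.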
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))
def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float32)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
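
The alignment values threaded through these tests map directly onto access widths: alignment counts elements, so alignment 8 on half-precision data is a 16-byte (128-bit) vector access, while alignment 2 drops to 32-bit accesses. A quick check of that arithmetic (element sizes are standard, not stated in this diff):

ELEMENT_BYTES = {'f16': 2, 'tf32': 4, 'f32': 4}

def access_bytes(element, alignment):
    # Bytes moved per vectorized access: element size times alignment (in elements).
    return ELEMENT_BYTES[element] * alignment

assert access_bytes('f16', 8) == 16  # alignment-8 f16 tests use 128-bit accesses
assert access_bytes('f16', 2) == 4   # alignment-2 variants fall back to 32-bit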

View File

@ -1,341 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=8)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=8)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
self.assertTrue(test_all_conv2d(operation))
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 14),
cutlass.Tensor4DCoord(8, 3, 3, 14),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 98),
cutlass.Tensor4DCoord(128, 3, 3, 98),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=2)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=2)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 14),
cutlass.Tensor4DCoord(8, 3, 3, 14),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 98),
cutlass.Tensor4DCoord(128, 3, 3, 98),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16],
element_a=cutlass.float16, element_b=cutlass.float16,
element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
A = TensorDescription(
element=math_inst.element_a,
layout=cutlass.TensorNHWC,
alignment=4)
B = TensorDescription(
element=math_inst.element_b,
layout=cutlass.TensorNHWC,
alignment=4)
C = TensorDescription(
element=cutlass.float16,
layout=cutlass.TensorNHWC,
alignment=8)
tile_description = TileDescription(
threadblock_shape=[128, 128, 64], stages=3,
warp_count=[2, 2, 1],
math_instruction=math_inst
)
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, cutlass.float16)
operation = Conv2dOperation(
conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
arch=80, tile_description=tile_description, A=A, B=B, C=C,
stride_support=StrideSupport.Strided,
epilogue_functor=epilogue_functor,
swizzling_functor=cutlass.IdentitySwizzle1
)
problem_sizes = [
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 12),
cutlass.Tensor4DCoord(8, 3, 3, 12),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 4, 4, 28),
cutlass.Tensor4DCoord(8, 3, 3, 28),
cutlass.Tensor4DCoord(0, 0, 0, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
cutlass.conv.Conv2dProblemSize(
cutlass.Tensor4DCoord(1, 23, 56, 100),
cutlass.Tensor4DCoord(128, 3, 3, 100),
cutlass.Tensor4DCoord(4, 0, 5, 0),
cutlass.MatrixCoord(3, 3),
cutlass.MatrixCoord(1, 1),
cutlass.conv.Mode.cross_correlation,
1, 1
),
]
self.assertTrue(test_all_conv2d(operation, problem_sizes))
if __name__ == '__main__':
pycutlass.get_memory_pool(2**26, 2**26)
unittest.main()
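
Every file in this diff seeds the device memory pool with pycutlass.get_memory_pool(2**26, 2**26) before unittest.main(); 2**26 bytes is 64 MiB, and the two equal arguments are presumably the initial and maximum pool sizes (an assumption from usage here, not documented in this diff):

import pycutlass

POOL_BYTES = 2 ** 26                       # 64 MiB, as used by every test file above
assert POOL_BYTES == 64 * 1024 * 1024
pycutlass.get_memory_pool(POOL_BYTES, POOL_BYTES)  # presumed (initial, maximum) sizes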

Some files were not shown because too many files have changed in this diff