CUTLASS 3.2.1 (#1113)
* Updates for 3.2.1 release. * Minor fix in gemm op profiler for raster order. * Add scheduler mapping for raster order in the kernels.
This commit is contained in:
660
test/python/cutlass/conv2d/conv2d_problem_sizes.py
Normal file
660
test/python/cutlass/conv2d/conv2d_problem_sizes.py
Normal file
@ -0,0 +1,660 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Utilities for defining Conv2D problem sizes for testing.
|
||||
|
||||
This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
|
||||
"""
|
||||
|
||||
import cutlass
|
||||
from cutlass import ConvMode
|
||||
from cutlass.shape import Conv2DProblemSize
|
||||
|
||||
|
||||
class TestbedConv2dProblemSizes:
    """Aggregates the Conv2d problem-size collections used by the unit tests.

    Ported from the C++ testbed in test/unit/conv/device/conv2d_problems.h.
    """

    def __init__(self, minimum_channel_size: int):
        """Collect every problem size whose per-group channel count is a
        multiple of ``minimum_channel_size`` into ``self.all``.

        :param minimum_channel_size: smallest channel alignment supported by
            the kernels under test
        """
        size_groups = (
            self.initialize_conv2d_default_sizes(minimum_channel_size),
            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
            self.initialize_conv2d_resnet50_sizes(1),
            self.initialize_conv2d_resnet50_sizes(34),
            self.initialize_conv2d_grouped_sizes(),
        )

        # Keep only problems whose per-group channel count is aligned to the
        # minimum channel size; unaligned problems are unsupported.
        self.all = [
            problem
            for group in size_groups
            for problem in group
            if (problem.C // problem.groups) % minimum_channel_size == 0
        ]
||||
def initialize_conv2d_default_sizes(self, minimum_channel_size):
|
||||
# Small input size x stride (1,1)
|
||||
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
|
||||
|
||||
conv2d_default_sizes = []
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 1, 1, minimum_channel_size,
|
||||
8, 1, 1, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 1, 8, minimum_channel_size,
|
||||
8, 1, 3, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 7, 8, minimum_channel_size,
|
||||
8, 3, 3, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 7, 9, minimum_channel_size,
|
||||
8, 4, 4, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
2, 7, 9, minimum_channel_size,
|
||||
8, 5, 5, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
3, 7, 9, minimum_channel_size,
|
||||
8, 6, 5, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
3, 7, 9, minimum_channel_size,
|
||||
8, 6, 6, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
3, 7, 9, minimum_channel_size,
|
||||
8, 7, 7, minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##############################################
|
||||
# Small input size x stride (2,2)
|
||||
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
|
||||
##############################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 11, 7, minimum_channel_size,
|
||||
8, 1, 1, minimum_channel_size,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 11, 7, minimum_channel_size,
|
||||
8, 3, 3, minimum_channel_size,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 13, 11, minimum_channel_size,
|
||||
8, 1, 1, minimum_channel_size,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 17, 19, minimum_channel_size,
|
||||
16, 2, 2, minimum_channel_size,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 23, 5, minimum_channel_size,
|
||||
16, 3, 3, minimum_channel_size,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 13, 17, 8,
|
||||
24, 3, 3, 8,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 23, 21, 8,
|
||||
24, 3, 3, 8,
|
||||
1, 1,
|
||||
3, 3,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 20, 24, 8,
|
||||
40, 3, 3, 8,
|
||||
3, 3,
|
||||
3, 3,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 15, 19, 160,
|
||||
224, 1, 1, 160,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 19, 37, 160,
|
||||
224, 3, 3, 160,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 16, 16, 160,
|
||||
224, 2, 3, 160,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 23, 21, 128,
|
||||
224, 3, 3, 128,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 29, 37, 160,
|
||||
224, 5, 5, 160,
|
||||
2, 2,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 15, 19, 32 + minimum_channel_size,
|
||||
96, 3, 3, 32 + minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 16, 24, 64 + minimum_channel_size,
|
||||
96, 3, 3, 64 + minimum_channel_size,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# Medium input size, filter size (1x1, 3,x3, 5x5, 7x7), stride (2, 2)
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 13, 16, 288,
|
||||
160, 5, 5, 288,
|
||||
2, 2,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 55, 51, 256,
|
||||
512, 1, 1, 256,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 71, 80, 32,
|
||||
64, 5, 5, 32,
|
||||
2, 2,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 224, 224, 8,
|
||||
64, 7, 7, 8,
|
||||
3, 3,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# Medium input size stride (3, 3), filter (3, 3), non-default padding
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 27, 23, 256,
|
||||
512, 3, 3, 256,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# Medium input size padding > stride, asymmetric filter, padding and striding
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 27, 31, 256,
|
||||
512, 3, 3, 256,
|
||||
5, 7,
|
||||
3, 4,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 27, 35, 256,
|
||||
512, 7, 5, 256,
|
||||
11, 7,
|
||||
3, 5,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
##########################################
|
||||
# Medium input size *mixed* stride (1, 2) and (2, 1),
|
||||
# filter (3, 3), default padding
|
||||
##########################################
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 27, 27, 256,
|
||||
512, 3, 3, 256,
|
||||
1, 1,
|
||||
1, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 27, 27, 256,
|
||||
512, 3, 3, 256,
|
||||
1, 1,
|
||||
2, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
######################################/
|
||||
# Additional input size
|
||||
######################################/
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
3, 28, 28, 256,
|
||||
256, 2, 2, 256,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
1, 32, 32, 16,
|
||||
32, 3, 3, 16,
|
||||
1, 1,
|
||||
6, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
32, 24, 32, 32,
|
||||
32, 1, 2, 32,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_default_sizes.append(Conv2DProblemSize(
|
||||
4, 2, 3, 256,
|
||||
328, 3, 5, 256,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
return conv2d_default_sizes
|
||||
|
||||
    # Add a few large and rigorous convolution problem sizes
    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
        """Return large/rigorous problem sizes.

        NOTE: the ``if False`` guard mirrors the ``#if 0`` in the C++ testbed
        this file was ported from — the rigorous sizes are kept for reference
        but deliberately excluded from regular runs, so this currently
        returns an empty list.
        """
        sizes = []
        if False:  # intentionally disabled; see docstring
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 124, 224, 2 * minimum_channel_size),
                (24, 7, 7, 2 * minimum_channel_size),
            ))

            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 233, 35, minimum_channel_size),
                (24, 7, 5, minimum_channel_size),
            ))
        return sizes
||||
|
||||
# Add resent50 layers to unit testing sizes
|
||||
def initialize_conv2d_resnet50_sizes(self, batch_size):
|
||||
conv2d_problem_vector = []
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 64,
|
||||
256, 1, 1, 64,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 64,
|
||||
64, 1, 1, 64,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 64,
|
||||
64, 3, 3, 64,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 256,
|
||||
64, 1, 1, 256,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 256,
|
||||
512, 1, 1, 256,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 56, 56, 256,
|
||||
128, 1, 1, 256,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 28, 28, 128,
|
||||
128, 3, 3, 128,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 28, 28, 128,
|
||||
512, 1, 1, 128,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 28, 28, 512,
|
||||
128, 1, 1, 512,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 28, 28, 512,
|
||||
1024, 1, 1, 512,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 28, 28, 512,
|
||||
256, 1, 1, 512,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 14, 14, 256,
|
||||
256, 3, 3, 256,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 14, 14, 256,
|
||||
1024, 1, 1, 256,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 14, 14, 1024,
|
||||
256, 1, 1, 1024,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 14, 14, 1024,
|
||||
2048, 1, 1, 1024,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 14, 14, 1024,
|
||||
512, 1, 1, 1024,
|
||||
0, 0,
|
||||
2, 2,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 7, 7, 512,
|
||||
512, 3, 3, 512,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 7, 7, 512,
|
||||
2048, 1, 1, 512,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
conv2d_problem_vector.append(Conv2DProblemSize(
|
||||
batch_size, 7, 7, 2048,
|
||||
512, 1, 1, 2048,
|
||||
0, 0,
|
||||
1, 1,
|
||||
1, 1,
|
||||
))
|
||||
|
||||
return conv2d_problem_vector
|
||||
|
||||
    def initialize_conv2d_grouped_sizes(self):
        """Return grouped-convolution problem sizes.

        Covers two regimes relative to a CTA tile with N=128, K=32:
        one (or several) CTAs computing a single group, and a single CTA
        computing multiple groups.
        """
        # Reference CTA tile extents the sizes below are designed around.
        threadblock_n = 128
        threadblock_k = 32

        sizes = []
        ##########################################
        # One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
        # One CTA calculates a single group
        ##########################################
        for cta_per_group_k in range(1, 4):
            for groups in range(2, 5):
                # conv_k is chosen so each group's K extent is an exact
                # multiple of CTA::N (cta_per_group_k CTAs per group).
                conv_k = cta_per_group_k * threadblock_n * groups
                sizes.append(Conv2DProblemSize(
                    1, 8, 8, threadblock_k * 2 * groups,
                    conv_k, 3, 3, threadblock_k * 2,
                    1, 1,
                    1, 1,
                    1, 1,
                    ConvMode.CrossCorrelation,
                    1,
                    groups
                ))

        # Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
        sizes.append(Conv2DProblemSize(
            1, 8, 8, threadblock_k,
            threadblock_n * 2, 3, 3, threadblock_k // 2,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            2
        ))

        # Larger, realistic grouped problems (3 groups, strided and unit-stride).
        sizes.append(Conv2DProblemSize(
            1, 56, 56, 696,
            768, 3, 3, 232,
            1, 1,
            2, 2,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            3
        ))
        sizes.append(Conv2DProblemSize(
            1, 14, 14, 1392,
            1536, 3, 3, 232,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            3
        ))

        ##########################################
        # One CTA calculates multiple groups: CTA::N % k_per_group = 0
        ##########################################

        # 2 groups per CTA
        sizes.append(Conv2DProblemSize(
            1, 8, 8, threadblock_k * 4,
            threadblock_n, 3, 3, threadblock_k * 2,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            2
        ))

        # 2 groups per CTA and partial gemm_k
        sizes.append(Conv2DProblemSize(
            1, 8, 8, threadblock_k,
            threadblock_n, 3, 3, threadblock_k // 2,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            2
        ))

        # 4 groups per CTA
        sizes.append(Conv2DProblemSize(
            1, 8, 8, threadblock_k * 8,
            threadblock_n // 2, 3, 3, threadblock_k * 2,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            4
        ))

        # 4 groups per CTA and partial gemm_k
        sizes.append(Conv2DProblemSize(
            1, 8, 8, threadblock_k * 2,
            threadblock_n // 2, 3, 3, threadblock_k // 2,
            1, 1,
            1, 1,
            1, 1,
            ConvMode.CrossCorrelation,
            1,
            4
        ))

        return sizes
|
||||
146
test/python/cutlass/conv2d/conv2d_sm80.py
Normal file
146
test/python/cutlass/conv2d/conv2d_sm80.py
Normal file
@ -0,0 +1,146 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for Conv2d operations on SM80
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from conv2d_test_utils import *
|
||||
|
||||
|
||||
# Quiet the library's default logging during test runs.
cutlass.set_log_level(logging.WARNING)
# Compute capability targeted by every test registered in this module.
cc = 80
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__.

    The add_test calls below attach one generated test method per kernel
    configuration; the class body itself is intentionally empty.
    """
    pass
||||
|
||||
|
||||
# Dynamically register one unittest method on Conv2dSm80 per configuration
# below. Nothing runs until unittest.main() is invoked at the bottom.
conv_problems = get_conv_problems()


# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    # F16, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
    # F16, tensor op
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, analytic iterator (accumulates in f16 as well)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
    # F16, tensor op, f32 output
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, different tile description
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
    # F32, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
    # Tf32, tensorop
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 16],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
    )
    # Split-K (serial and parallel reduction variants)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
        split_k_slices=2)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
        split_k_slices=5)
    # Swizzling functor
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)

# Tests for few channels and fixed channels
# F16, tensor op, few channels
for c, tb, stage, inst in zip([2, 1],
                              [[128, 128, 64], [128, 128, 32]],
                              [3, 2],
                              [[16, 8, 16], [16, 8, 8]]):
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=tb,
        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
    )
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
    )

# Test activations
for activation in ["relu", "leaky_relu"]:
    # NOTE(review): the ("parallel", 1) pairing looks unusual (parallel
    # split-K with a single slice) — confirm it is intentional.
    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
        add_test(
            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
            opclass="tensor_op", threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
            split_k_slices=split_k_slices, activation=activation)


if __name__ == '__main__':
    unittest.main()
|
||||
425
test/python/cutlass/conv2d/conv2d_test_utils.py
Normal file
425
test/python/cutlass/conv2d/conv2d_test_utils.py
Normal file
@ -0,0 +1,425 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Utility functions for Conv2d tests.
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
import cutlass
|
||||
from cutlass import (
|
||||
ConvKind,
|
||||
ConvMode,
|
||||
DataType,
|
||||
DataTypeNames,
|
||||
EpilogueScheduleSuffixes,
|
||||
KernelScheduleSuffixes,
|
||||
LayoutType,
|
||||
OpcodeClassNames,
|
||||
ShortDataTypeNames,
|
||||
ShortLayoutTypeNames,
|
||||
SplitKMode,
|
||||
)
|
||||
from cutlass.backend.utils.software import SubstituteTemplate
|
||||
from cutlass.shape import Conv2DProblemSize
|
||||
from cutlass.utils.datatypes import numpy_type, torch_type
|
||||
|
||||
from conv2d_problem_sizes import TestbedConv2dProblemSizes
|
||||
|
||||
|
||||
def get_name_conv2d(
    arch,
    conv_kind,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm,
    swizzle,
    split_k_mode,
    split_k_slices,
    activation
):
    """
    Generates a procedural name for a test case for conv2d

    :param arch: compute capability of kernel being generated
    :type arch: int
    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
    :type conv_kind: str
    :param element: data type of operands A and B (both use the same type)
    :param element_accumulator: data type used in accumulation
    :param element_output: data type of the output operand
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param warp_count: indexable container of warp counts per threadblock dimension
    :param instruction_shape: indexable container of MMA instruction dimensions
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param iterator_algorithm: the iterator algorithm applied, or None for "AUTO"
    :param swizzle: threadblock swizzling extent, or None for the default of 1
    :param split_k_mode: split-K mode name (e.g. "serial", "parallel")
    :param split_k_slices: number of split-K slices
    :param activation: epilogue activation name

    :return: str
    """
    # Normalize the optional knobs so they always appear in the name.
    if iterator_algorithm is None:
        iterator_algorithm = "AUTO"
    if swizzle is None:
        swizzle = 1
    name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"

    return SubstituteTemplate(
        name_format,
        {
            "arch": str(arch),
            "conv_kind": conv_kind,
            "iter_alg": iterator_algorithm,
            # A and B share one element type in these tests.
            "eA": DataTypeNames[element],
            "eB": DataTypeNames[element],
            "eC": DataTypeNames[element_output],
            "opclass": opclass,
            "acc": DataTypeNames[element_accumulator],
            "tbM": str(threadblock_shape[0]),
            "tbN": str(threadblock_shape[1]),
            "tbK": str(threadblock_shape[2]),
            # Warp tile = threadblock tile divided by warp count per dimension.
            "wM": str(threadblock_shape[0] // warp_count[0]),
            "wN": str(threadblock_shape[1] // warp_count[1]),
            "wK": str(threadblock_shape[2] // warp_count[2]),
            "IM": str(instruction_shape[0]),
            "IN": str(instruction_shape[1]),
            "IK": str(instruction_shape[2]),
            "stages": str(stages),
            "swizzle": str(swizzle),
            "split_k_mode": split_k_mode,
            "split_k_slices": str(split_k_slices),
            "activation": activation
        }
    )
|
||||
|
||||
|
||||
def conv2d_few_channel_problemsizes(channels):
    """Return Conv2d problem sizes whose channel count is ``channels``.

    Used to exercise the few-channel / fixed-channel iterator algorithms.
    Each table row is (H, W, K, R, S, stride); padding is always (1, 1),
    dilation (1, 1), in cross-correlation mode with no split-K and a
    single group.
    """
    table = [
        (8, 8, 16, 3, 3, 2),
        (16, 16, 16, 3, 3, 2),
        (16, 16, 16, 7, 7, 1),
        (224, 224, 32, 7, 7, 1),
        (224, 224, 64, 7, 7, 2),
        (224, 224, 64, 5, 5, 1),
        (224, 224, 64, 5, 5, 2),
    ]

    return [
        Conv2DProblemSize(
            1, h, w, channels,
            k, r, s, channels,
            1, 1,
            stride, stride,
            1, 1,
            ConvMode.CrossCorrelation,
            1, 1
        )
        for h, w, k, r, s, stride in table
    ]
|
||||
|
||||
|
||||
def validate_problem_size(ps, conv_kind, split_k_slices):
    """Check that a conv2d problem size is self-consistent and supported.

    Verifies that the stored output extents (P, Q) match what the standard
    convolution output-size formula produces from the input extents, padding,
    dilation, and stride, and rejects configurations that CUTLASS does not
    support (split-K with strided dgrad).

    :return: True if the problem can be run, False otherwise
    :rtype: bool
    """
    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad.
    is_strided = ps.stride_h > 1 or ps.stride_w > 1
    if conv_kind == "dgrad" and split_k_slices > 1 and is_strided:
        return False

    return True
|
||||
|
||||
|
||||
class Conv2dLauncherFrontend:
    # Drives a cutlass.Conv2d plan end-to-end: builds random channels-last
    # CUDA tensors, runs the CUTLASS kernel, and compares the result
    # bit-exactly against a PyTorch reference.

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        # Mirror the plan's element types so generated tensors match the
        # kernel's expected data types.
        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator
        self.layout_A = LayoutType.TensorNHWC
        self.layout_B = LayoutType.TensorNHWC
        self.layout_C = LayoutType.TensorNHWC
        self.layout_D = LayoutType.TensorNHWC

        self.element_compute = DataType.f32

        # Smaller random range for half-precision inputs so the exact
        # comparison in run() is not broken by rounding.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            self.rand_max = 1
        else:
            self.rand_max = 4
        self.activation = plan.activation

    def uniform_init(self, size, dtype):
        # ceil() of a uniform sample yields integer-valued tensors, which
        # keeps the CUTLASS-vs-torch comparison exact.
        tensor = torch.ceil(
            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
        ).to(memory_format=torch.channels_last)
        return tensor

    def reference(self, ps, A, B, C, alpha, beta, activation):
        # Compute D = activation(alpha * conv(A, B) + beta * C) with torch,
        # dispatching on the convolution kind.
        if self.conv_kind == ConvKind.Fprop:
            torch_result = alpha * torch.ops.aten.conv2d(
                A,
                B,
                stride=(ps.stride_h, ps.stride_w),
                padding=(ps.pad_h, ps.pad_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Dgrad:
            torch_result = alpha * torch.nn.grad.conv2d_input(
                (ps.N, ps.C, ps.H, ps.W),
                B,
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Wgrad:
            torch_result = alpha * torch.nn.grad.conv2d_weight(
                B,
                (ps.K, ps.C, ps.R, ps.S),
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        else:
            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

        if activation == cutlass.backend.epilogue.relu:
            torch_result = torch.nn.functional.relu(torch_result)
        elif activation == cutlass.backend.epilogue.leaky_relu:
            # NOTE(review): the 0.5 slope here must match the slope the plan
            # was configured with (see add_test) — confirm if either changes.
            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
        return torch_result

    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        # Operand extents depend on which tensor plays the "output" role
        # for the given convolution kind.
        if self.conv_kind == ConvKind.Fprop:
            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
        elif self.conv_kind == ConvKind.Dgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
        elif self.conv_kind == ConvKind.Wgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        # Fixed seed for reproducible operand initialization.
        torch.manual_seed(self.seed)

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)
        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
            stride=(ps.stride_h, ps.stride_w),
            padding=(ps.pad_h, ps.pad_w),
            dilation=(ps.dilation_h, ps.dilation_w),
            alpha=alpha, beta=beta,
            split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)

        torch.cuda.synchronize()
        # Inputs are integer-valued, so an exact (bitwise) match is expected.
        passed = torch.equal(tensor_D, tensor_D_ref)

        return passed
|
||||
|
||||
|
||||
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """Create a test-running function with the given specification

    Builds a test method that constructs a cutlass.Conv2d plan from the
    given tile/kernel configuration, runs it over ``problem_sizes`` via
    Conv2dLauncherFrontend, and asserts each result matches the torch
    reference. The method is attached to ``cls`` under a name derived from
    the configuration, so unittest discovers it automatically.
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )

        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation != "identity":
            # leaky_relu needs an explicit slope argument (0.5 matches the
            # slope used by Conv2dLauncherFrontend.reference).
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)

        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")

        for ps in problem_sizes:
            # Skip sizes that are inconsistent or unsupported for this
            # kind / split-K combination.
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue

            self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))

    setattr(cls, test_name, run)

    return run
|
||||
|
||||
|
||||
def get_conv_problems():
    """Assemble the conv2d problem-size list used by the unit tests.

    Starts from the stock testbed problems (64 is the minimum channel
    size) and appends problems whose channel counts (12, 14, 98) exercise
    the alignment-4 and alignment-2 code paths.
    """
    conv_problems = TestbedConv2dProblemSizes(64).all

    # (N, H, W, C, K, R, S, pad_h, pad_w, stride_h, stride_w) per extra case;
    # dilation is fixed at 1 and the filter channel count equals C.
    alignment_cases = [
        (1, 4, 4, 12, 8, 3, 3, 0, 0, 3, 3),
        (1, 4, 4, 14, 8, 3, 3, 0, 0, 3, 3),
        (1, 23, 56, 98, 128, 3, 3, 4, 5, 3, 3),
    ]
    conv_problems += [
        Conv2DProblemSize(
            n, h, w, c,
            k, r, s, c,
            pad_h, pad_w,
            stride_h, stride_w,
            1, 1,
            ConvMode.CrossCorrelation,
            1, 1
        )
        for n, h, w, c, k, r, s, pad_h, pad_w, stride_h, stride_w in alignment_cases
    ]

    return conv_problems
|
||||
44
test/python/cutlass/conv2d/run_all_tests.py
Normal file
44
test/python/cutlass/conv2d/run_all_tests.py
Normal file
@ -0,0 +1,44 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Discover every conv2d test module that lives next to this script and
    # run it; fail loudly if any test case does not pass.
    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(script_dir, 'conv2d_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception('Test cases failed')
|
||||
308
test/python/cutlass/emit/pytorch.py
Normal file
308
test/python/cutlass/emit/pytorch.py
Normal file
@ -0,0 +1,308 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Tests emitting a CUTLASS kernel to a PyTorch CUDA extension
|
||||
"""
|
||||
|
||||
import random
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
|
||||
if cutlass.utils.datatypes.torch_available:
|
||||
import torch
|
||||
|
||||
|
||||
def _initialize(dtype, M: int, N: int, K: int):
    """
    Create the A, B, C, and D operands of an M x N x K GEMM on the GPU.

    Values are small random integers (in [-3, 2]) cast to ``dtype`` so that
    reference comparisons are numerically stable.

    :param dtype: data type of tensors
    :param M: M dimension of GEMM problem
    :type M: int
    :param N: N dimension of GEMM problem
    :type N: int
    :param K: K dimension of GEMM problem
    :type K: int

    :return: initialized tensors A, B, C, and D
    :rtype: list
    """
    shapes = ((M, K), (K, N), (M, N), (M, N))
    return [torch.randint(-3, 3, shape, device='cuda').to(dtype) for shape in shapes]
|
||||
|
||||
|
||||
def _generate_problems(dtype, num):
    """
    Generate ``num`` GEMM operand sets of randomly chosen sizes.

    Each problem's M, N, and K are drawn independently from a fixed set of
    power-of-two sizes, and its operands are created via ``_initialize``.

    :param dtype: data type of tensors
    :param num: number of GEMMs to generate
    :type num: int

    :return: lists of A, B, C, and D tensors
    :rtype: list
    """
    valid_sizes = [128, 256, 512, 1024]
    As, Bs, Cs, Ds = [], [], [], []
    for _ in range(num):
        dims = [random.choice(valid_sizes) for _ in range(3)]
        operands = _initialize(dtype, *dims)
        for collection, tensor in zip((As, Bs, Cs, Ds), operands):
            collection.append(tensor)
    return As, Bs, Cs, Ds
|
||||
|
||||
def _generate_conv2d_problem(conv_kind, dtype, ps):
|
||||
"""
|
||||
Utility function to generate conv2d inputs
|
||||
|
||||
:param conv_kind: kind of convolution
|
||||
:type conv_kind: str
|
||||
:param dtype: data type of tensors
|
||||
:param problem_size: the conv2d problem size
|
||||
:type problem_size: cutlass.shape.Conv2DProblemSize
|
||||
|
||||
:return: initialized tensors A, B, C, and D
|
||||
:rtype: list
|
||||
"""
|
||||
if conv_kind == "fprop":
|
||||
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
elif conv_kind == "dgrad":
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
else:
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
|
||||
return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
|
||||
|
||||
|
||||
@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
class PyTorchExtensionTest(unittest.TestCase):
    # Verifies that kernels emitted via cutlass.emit.pytorch build as
    # PyTorch extension modules (jit=True) and that the resulting module's
    # run() matches torch references across its optional-argument forms.

    def test_gemm(self):
        # GEMM with a ReLU epilogue: D = relu(alpha * A @ B + beta * C).
        random.seed(2023)

        dtype = torch.float16
        plan = cutlass.op.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        plan.activation = cutlass.epilogue.relu
        op = plan.construct()

        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)

        A, B, C, _ = _initialize(dtype, 1024, 256, 512)

        # With default alpha=1, beta=0, C does not contribute, so the same
        # reference covers all four call forms below.
        D_ref = torch.nn.functional.relu(A @ B)
        D = mod.run(A, B)
        assert torch.allclose(D, D_ref)

        D = mod.run(A, B, C)
        assert torch.allclose(D, D_ref)

        D = mod.run(A, B, C, 1.0)
        assert torch.allclose(D, D_ref)

        D = mod.run(A, B, C, 1.0, 0.0)
        assert torch.allclose(D, D_ref)

        # Non-trivial alpha/beta scaling.
        alpha = 2.0
        beta = -1.0
        D_ref = torch.nn.functional.relu((A @ B) * alpha + (beta * C))
        D = mod.run(A, B, C, alpha, beta)
        assert torch.allclose(D, D_ref)

    def test_grouped_gemm(self):
        # Grouped GEMM: lists of operands with independent random sizes.
        random.seed(2023)

        dtype = torch.float16
        plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        op = plan.construct()

        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='grouped_gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)

        As, Bs, Cs, _ = _generate_problems(dtype, 50)

        def check_all(X, Y):
            # Compare every emitted result against its reference.
            for x, y in zip(X, Y):
                assert torch.allclose(x, y)

        Ds_ref = [a @ b for a, b in zip(As, Bs)]
        Ds = mod.run(As, Bs)
        check_all(Ds, Ds_ref)

        Ds = mod.run(As, Bs, Cs)
        check_all(Ds, Ds_ref)

        Ds = mod.run(As, Bs, Cs, 1.0)
        check_all(Ds, Ds_ref)

        Ds = mod.run(As, Bs, Cs, 1.0, 0.0)
        check_all(Ds, Ds_ref)

        alpha = 2.0
        beta = -1.0
        Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
        Ds = mod.run(As, Bs, Cs, alpha, beta)
        check_all(Ds, Ds_ref)

    def test_conv2d_fprop(self):
        # Forward convolution with a ReLU epilogue, including split-K modes.
        torch.manual_seed(2023)

        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
        plan.activation = "relu"

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

        problem_size = cutlass.shape.Conv2DProblemSize(
            1, 4, 4, 16,
            8, 3, 3, 16,
            0, 0,
            3, 3,
            1, 1
        )

        A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
        stride = (problem_size.stride_h, problem_size.stride_w)
        padding = (problem_size.pad_h, problem_size.pad_w)

        alpha = 1.0
        beta = 0.5

        D_ref = alpha * torch.ops.aten.conv2d(
            A, B, stride=stride, padding=padding
        ) + beta * C
        D_ref = torch.nn.functional.relu(D_ref)
        D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)

        assert torch.allclose(D, D_ref)

        # Test serial split-K
        D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
        assert torch.allclose(D, D_serial_split_k)

        # Test parallel split-K
        D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
        assert torch.allclose(D, D_parallel_split_k)


    def test_conv2d_dgrad(self):
        # Data-gradient convolution; mod.run takes the input size first.
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

        problem_size = cutlass.shape.Conv2DProblemSize(
            1, 4, 4, 16,
            8, 3, 3, 16,
            0, 0,
            3, 3,
            1, 1,
            cutlass.ConvMode.CrossCorrelation,
            1, 1
        )

        A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
        stride = (problem_size.stride_h, problem_size.stride_w)
        padding = (problem_size.pad_h, problem_size.pad_w)

        alpha = 1.0
        beta = 0.5
        input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
        D_ref = alpha * torch.nn.grad.conv2d_input(
            input_size, B, A,
            stride=stride, padding=padding
        ) + beta * C
        D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )

        assert torch.allclose(D, D_ref)

    def test_conv2d_wgrad(self):
        # Weight-gradient convolution; mod.run takes the weight size first.
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

        problem_size = cutlass.shape.Conv2DProblemSize(
            1, 4, 4, 16,
            8, 3, 3, 16,
            0, 0,
            3, 3,
            1, 1,
            cutlass.ConvMode.CrossCorrelation,
            1, 1
        )

        A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
        stride = (problem_size.stride_h, problem_size.stride_w)
        padding = (problem_size.pad_h, problem_size.pad_w)

        alpha = 1.0
        beta = 0.5
        weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
        D_ref = alpha * torch.nn.grad.conv2d_weight(
            B, weight_size, A,
            stride=stride, padding=padding
        ) + beta * C
        D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)

        assert torch.allclose(D, D_ref)

        # Test serial split-K
        D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
        assert torch.allclose(D, D_serial_split_k)

        # Test parallel split-K
        D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
        assert torch.allclose(D, D_parallel_split_k)
||||
|
||||
|
||||
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
|
||||
100
test/python/cutlass/evt/evt_compute_sm80_90.py
Normal file
100
test/python/cutlass/evt/evt_compute_sm80_90.py
Normal file
@ -0,0 +1,100 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
"""
|
||||
Unit test for compute node in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
from cutlass import swizzle
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTComputeSM90(EVTTestCaseBase):
    # Epilogue-visitor-tree (EVT) compute-node tests.
    # NOTE(review): the class name says SM90 but the skip condition admits
    # both Sm80 and Sm90 — confirm the intended naming.

    def test_arith(self):
        """
        Test arithmetic ops (+, *, -, /) in an EVT compute node
        """
        def evt_arith_compute(accum, C, alpha, beta, gamma):
            D = ((accum + C) * alpha - gamma) / beta
            return D

        # Verify across a sweep of (m, n, k, l) problem sizes.
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }

            launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_func_call(self):
        """
        Test function calls (relu, multiply_add) in an EVT compute node
        """
        def evt_func_call(accum, C, alpha, beta, gamma):
            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }

            launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
# Allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
|
||||
173
test/python/cutlass/evt/evt_layout_sm80_90.py
Normal file
173
test/python/cutlass/evt/evt_layout_sm80_90.py
Normal file
@ -0,0 +1,173 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for store nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLayoutSM90(EVTTestCaseBase):
    # EVT layout-node (permute / reshape) tests.
    # NOTE(review): the class name says SM90 but the skip condition admits
    # both Sm80 and Sm90 — confirm the intended naming.

    def test_permute_1(self):
        """
        Permute into (l, n, m), accumulate, and permute back so that both
        outputs D and F keep shape (l, m, n)
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D_permute = F_permute + permute(C, indices=(0, 2, 1))
            D = permute(D_permute, indices=(0, 2, 1))
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }

            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_2(self):
        """
        Store a permuted output: D has shape (l, n, m) while F keeps (l, m, n)
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D = F_permute + C
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, n, m)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, n, m)),
            }

            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_3(self):
        """
        Permute the batch dimension: D has shape (m, l, n) while F keeps (l, m, n)
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(1, 0, 2))
            D = F_permute + C
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (m, l, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (m, l, n)),
            }

            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_reshape(self):
        """
        Test reshaping an auxiliary input (16, 32) -> (512, 1) before adding
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            E_reshape = reshape(TensorE, new_shape=(512, 1))
            D = F + E_reshape
            return D

        # assumes self.m/self.n/self.l come from EVTTestCaseBase with
        # m * 1 == 512 compatible broadcasting — TODO confirm in the base class
        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (16, 32)),
            "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
        }

        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)

    def test_reshape2(self):
        """
        Test reshaping the intermediate result to (2, 3, 512, 256) before a
        broadcast add
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
            D = F_reshape + TensorE
            return D

        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
        }

        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
142
test/python/cutlass/evt/evt_load_sm80_90.py
Normal file
142
test/python/cutlass/evt/evt_load_sm80_90.py
Normal file
@ -0,0 +1,142 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for load nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTLoadSM90(EVTTestCaseBase):
|
||||
|
||||
def test_tensor_load(self):
|
||||
"""
|
||||
Load extra tensor with shape [m, n]
|
||||
"""
|
||||
def evt_tensor_load(accum, C, aux, aux_batch):
|
||||
D = accum + C + aux + aux_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"aux_batch": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
|
||||
input_keys = ["C", "aux", "aux_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_row_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [1, n]
|
||||
"""
|
||||
def evt_row_broadcast(accum, C, bias, bias_batch):
|
||||
D = accum + C + bias + bias_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"bias": self.fake_tensor(self.element, (n,)),
|
||||
"bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
|
||||
input_keys = ["C", "bias", "bias_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_column_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [m, 1]
|
||||
"""
|
||||
def evt_column_broadcast(accum, C, bias, bias_batch):
|
||||
D = accum + C + bias + bias_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"bias": self.fake_tensor(self.element, (m, 1)),
|
||||
"bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
|
||||
input_keys = ["C", "bias", "bias_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_scalar_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [1, 1]
|
||||
"""
|
||||
def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
|
||||
D = accum + C + alpha + alpha_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
|
||||
input_keys = ["C", "alpha", "alpha_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
274
test/python/cutlass/evt/evt_mixed_sm80_90.py
Normal file
274
test/python/cutlass/evt/evt_mixed_sm80_90.py
Normal file
@ -0,0 +1,274 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unittest for mixed types of nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
from cutlass.swizzle import ThreadblockSwizzleStreamK
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTMixedSM90(EVTTestCaseBase):
|
||||
def test_mixed_dag(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
if device_cc() == 80:
|
||||
aligments = [2, 4, 8]
|
||||
else:
|
||||
# Sm90 EVT currently only supports 128-bit alignment
|
||||
aligments = [8,]
|
||||
for align in aligments:
|
||||
for m, n, k, l in self.get_problem_sizes(align):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_float(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for align in [3, 2, 4]:
|
||||
for m, n, k, l in self.get_problem_sizes(align):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"cbias": self.fake_tensor(np.float32, (m, 1)),
|
||||
"rbias": self.fake_tensor(np.float32, (n,)),
|
||||
"D": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"F": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(np.float32, (n,)),
|
||||
"E_col_max": self.fake_tensor(np.float32, (m, 1))
|
||||
}
|
||||
launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_stage2(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_partition_k(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
tile_description = {
|
||||
"threadblock_shape": [128, 128, 64],
|
||||
"warp_count": [2, 2, 2]
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_stream_k(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
# High per-sm occupancy tile_description
|
||||
tile_description = {
|
||||
"threadblock_shape": [128, 128, 32],
|
||||
"warp_count": [2, 2, 1],
|
||||
"stages": 3
|
||||
}
|
||||
tds = [None, tile_description]
|
||||
for td in tds:
|
||||
for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
|
||||
if l == 1:
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (m, n)),
|
||||
"F": self.fake_tensor(self.element, (m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
else:
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
if td is not None:
|
||||
launcher = EVTTestBed(
|
||||
self.element, evt_mixed_dag, example_inputs,
|
||||
tile_description=td,
|
||||
swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
|
||||
else:
|
||||
launcher = EVTTestBed(
|
||||
self.element, evt_mixed_dag, example_inputs,
|
||||
swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
|
||||
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_mixed_dag_no_batch(self):
|
||||
def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, _ in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (m, n)),
|
||||
"F": self.fake_tensor(self.element, (m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, 1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
155
test/python/cutlass/evt/evt_store_sm80_90.py
Normal file
155
test/python/cutlass/evt/evt_store_sm80_90.py
Normal file
@ -0,0 +1,155 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for store nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTStoreSM90(EVTTestCaseBase):
|
||||
|
||||
def test_aux_store(self):
|
||||
"""
|
||||
Returning a tensor with shape [m, n]
|
||||
"""
|
||||
def evt_aux_store(accum, alpha, C):
|
||||
F = alpha * accum
|
||||
D = F + C
|
||||
return D, F
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_col_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [m, 1]
|
||||
"""
|
||||
def evt_row_reduce(accum, alpha, C):
|
||||
acc_row_max = max(accum, dim=[2,])
|
||||
F = alpha * accum
|
||||
F_row_max = max(F, dim=[0, 2])
|
||||
D = F + C
|
||||
return D, F_row_max, acc_row_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(np.float32, (m, 1)),
|
||||
"acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_row_max", "acc_row_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_row_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [n]
|
||||
"""
|
||||
def evt_col_reduce(accum, alpha, C):
|
||||
acc_col_max = max(accum, dim=[1,])
|
||||
F = alpha * accum
|
||||
F_col_max = max(F, dim=[0, 1])
|
||||
D = F + C
|
||||
return D, F_col_max, acc_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_col_max": self.fake_tensor(np.float32, (n,)),
|
||||
"acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_col_max", "acc_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_scalar_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [1,]
|
||||
"""
|
||||
def evt_scalar_reduce(accum, alpha, C):
|
||||
acc_max = max(accum, dim=[1, 2])
|
||||
F = alpha * accum
|
||||
F_max = max(F, dim=[0, 1, 2])
|
||||
D = F + C
|
||||
return D, F_max, acc_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
|
||||
"F_max": self.fake_tensor(np.float32, (1,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_max", "acc_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
44
test/python/cutlass/evt/run_all_tests.py
Normal file
44
test/python/cutlass/evt/run_all_tests.py
Normal file
@ -0,0 +1,44 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
loader = unittest.TestLoader()
|
||||
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
|
||||
tests = loader.discover(script_dir, 'evt_*.py')
|
||||
testRunner = unittest.runner.TextTestRunner()
|
||||
results = testRunner.run(tests)
|
||||
if not results.wasSuccessful():
|
||||
raise Exception('Test cases failed')
|
||||
230
test/python/cutlass/evt/utils/evt_testbed.py
Normal file
230
test/python/cutlass/evt/utils/evt_testbed.py
Normal file
@ -0,0 +1,230 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 20123 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Testbed classes of EVT
|
||||
"""
|
||||
|
||||
import torch
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass import Tensor
|
||||
import cutlass.backend.evt
|
||||
from cutlass.profiler import CUDAEventProfiler
|
||||
from cutlass.shape import GemmCoord
|
||||
from cutlass.utils.datatypes import torch_type
|
||||
|
||||
|
||||
class EVTReferenceModule:
|
||||
def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
|
||||
self.layout_A = layout_A
|
||||
self.layout_B = layout_B
|
||||
self.layout_C = layout_C
|
||||
self.epilogue_visitor = epilogue_visitor
|
||||
|
||||
def run(self, A, B, C, problem_size, alpha, beta, batch=1):
|
||||
if self.layout_A == cutlass.LayoutType.RowMajor:
|
||||
A_row = A.view((batch, problem_size.m, problem_size.k))
|
||||
else:
|
||||
A_col = A.view((batch, problem_size.k, problem_size.m))
|
||||
A_row = torch.permute(A_col, (0, 2, 1))
|
||||
|
||||
if self.layout_B == cutlass.LayoutType.RowMajor:
|
||||
B_row = B.view((batch, problem_size.k, problem_size.n))
|
||||
else:
|
||||
B_col = B.view((batch, problem_size.n, problem_size.k))
|
||||
B_row = torch.permute(B_col, (0, 2, 1))
|
||||
|
||||
if self.layout_C == cutlass.LayoutType.RowMajor:
|
||||
C_row = C.view((batch, problem_size.m, problem_size.n))
|
||||
else:
|
||||
C_col = C.view((batch, problem_size.n, problem_size.m))
|
||||
C_row = torch.permute(C_col, (0, 2, 1))
|
||||
|
||||
out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
|
||||
|
||||
if self.layout_C == cutlass.LayoutType.ColumnMajor:
|
||||
out = torch.permute(out_row, (0, 2, 1))
|
||||
else:
|
||||
out = out_row
|
||||
|
||||
return torch.flatten(out)
|
||||
|
||||
def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
|
||||
# Running the mainloop
|
||||
accum = self.run(
|
||||
A, B, C, problem_size, 1.0, 0.0, batch=batch
|
||||
).reshape(batch, problem_size.m, problem_size.n)
|
||||
|
||||
# Running the epilogue
|
||||
epilogue_args["accum"] = accum
|
||||
references = self.epilogue_visitor(**epilogue_args)
|
||||
|
||||
# Return the results
|
||||
if not isinstance(references, tuple):
|
||||
references = (references,)
|
||||
return references
|
||||
|
||||
|
||||
class EVTTestBed:
    """
    Epilogue Visitor Testbed

    Compiles a GEMM plan with a traced epilogue visitor, runs it on the
    device, and checks its outputs against the host-side EVTReferenceModule.
    """
    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
        # element: cutlass.DataType of the GEMM operands
        # evt_fn: Python function describing the epilogue, traced into an EVT
        # example_inputs: dict of example epilogue arguments used for tracing
        # profile: if True, time the kernel with CUDAEventProfiler after verification
        # kwargs: optional "tile_description", "swizzling_functor" and
        #         "epilogue_stages" overrides applied to the plan below
        self.element = element
        layout = cutlass.LayoutType.RowMajor
        self.example_inputs = example_inputs

        # Create the Gemm plan
        self.plan = cutlass.op.Gemm(element=element, layout=layout, element_accumulator=torch.float32)

        if "tile_description" in kwargs:
            self.plan.tile_description = kwargs["tile_description"]

        if "swizzling_functor" in kwargs:
            self.plan.swizzling_functor = kwargs["swizzling_functor"]

        # Compile the epilogue visitor
        epilogue_visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
        if "epilogue_stages" in kwargs:
            epilogue_visitor.epilogue_stages = kwargs["epilogue_stages"]
        self.plan.epilogue_visitor = epilogue_visitor

        # Reference model: all operands row-major, same visitor as the plan
        self.reference_fn = EVTReferenceModule(layout, layout, layout, epilogue_visitor)

        self.profile = profile

    def get_torch_tensor(self, shape, dtype=None, fill=None):
        """Allocate a CUDA tensor of ``shape``.

        With ``fill`` set the tensor is constant-filled; otherwise it holds
        small integer-valued data (ceil of uniform[-4.5, 3.5]) so device and
        host results can be compared with exact equality.
        """
        if dtype is None:
            dtype = self.element

        dtype = torch_type(dtype)
        if fill is None:
            return torch.ceil(
                torch.empty(size=shape, dtype=dtype, device="cuda").uniform_(-4.5, 3.5)
            )
        else:
            return torch.full(shape, fill, dtype=dtype, device="cuda")

    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
        """
        Verify the results

        :param problem_size: (m, n, k) extents of the GEMM
        :param input_keys: names in example_inputs consumed by the epilogue
        :param result_keys: names in example_inputs produced by the epilogue
        :param batch_count: number of GEMM batches
        """
        problem_size = GemmCoord(*problem_size)

        # Initiate the GEMM arguments
        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))

        # Initialize the epilogue args: random data for inputs; outputs get
        # a constant fill (-1000 for "max"-named outputs, presumably so any
        # reduced maximum overwrites it; 0 otherwise).
        epilogue_args = {}
        for key in self.example_inputs.keys():
            if key in input_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element)
                else:
                    # Non-Tensor entries (plain scalars) pass through unchanged.
                    epilogue_args[key] = tensor
            elif key in result_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    if "max" in key:
                        fill = -1000
                    else:
                        fill = 0
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element, fill=fill)
                else:
                    epilogue_args[key] = tensor

        tensor_D = epilogue_args["D"]
        if "C" in epilogue_args:
            tensor_C = epilogue_args["C"]
        else:
            # No separate C input in the EVT: reuse D (the host reference
            # runs its mainloop with beta=0, so C's values do not matter).
            tensor_C = tensor_D
        # Run the device kernel
        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)

        # Run the host reference
        evt_args_inputs = {}
        for key in input_keys:
            evt_args_inputs[key] = epilogue_args[key]

        reference_results = self.reference_fn(
            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)

        # Compare the results; exact equality is expected because operands
        # are small integer-valued tensors (see get_torch_tensor).
        for result, ref in zip(result_keys, reference_results):
            assert torch.equal(epilogue_args[result].flatten(), ref.flatten())

        # Run profile (100 warmup + 100 timed iterations)
        if self.profile:
            profiler = CUDAEventProfiler(
                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
                visitor_args = epilogue_args
            )
            print(f"Cutlass Python Duration: {profiler()}")
|
||||
|
||||
|
||||
class EVTTestCaseBase(unittest.TestCase):
    """
    Base class for EVT (Epilogue Visitor Tree) unit tests.

    Provides a fixed GEMM problem shape, a helper for building symbolic EVT
    example tensors, and a generator of concrete problem sizes that
    exercise alignment boundary cases.
    """
    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
        """
        :param methodName: unittest entry point to run
        :param lmnk: (batch L, M, N, K) extents of the default GEMM problem
        """
        super().__init__(methodName)

        self.element = cutlass.DataType.f16
        self.l, self.m, self.n, self.k = lmnk

        self.problem_size = (self.m, self.n, self.k)

        # Fixed seed so randomized operand initialization is reproducible.
        torch.random.manual_seed(42)

    def fake_tensor(self, element, shape):
        """Construct a symbolic row-major Tensor used as an EVT example input."""
        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)

    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
        """Enumerate (m, n, k, l) problem sizes exercising ``alignment``.

        M/N extents sit at the alignment itself and just below 512;
        alignments divisible by 8 additionally get a larger 768 extent.

        :param alignment: operand alignment in elements
        :param k: K extent; defaults to the testbed's K
        :param batch_count: iterable of batch sizes to sweep (the default
            is a tuple rather than a list to avoid the shared-mutable-default
            pitfall; list arguments still work)
        :return: list of (m, n, k, l) tuples
        """
        k = k if k else self.k
        problem_size_m = [alignment, 512 - 3 * alignment]
        problem_size_n = [alignment, 512 - alignment]
        if alignment % 8 == 0:
            problem_size_m.append(768)
            problem_size_n.append(768)
        problem_size_l = batch_count
        problem_sizes = []
        for m in problem_size_m:
            for n in problem_size_n:
                for l in problem_size_l:
                    problem_sizes.append((m, n, k, l))

        return problem_sizes
|
||||
134
test/python/cutlass/gemm/gemm_batched.py
Normal file
134
test/python/cutlass/gemm/gemm_batched.py
Normal file
@ -0,0 +1,134 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
High-level tests for running batched GEMMs
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
from math import prod
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import torch
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
torch.manual_seed(2023)
|
||||
|
||||
|
||||
def pytorch_reference(A, B, C, alpha, beta):
    """Compute ``alpha * (A @ B) + beta * C`` with PyTorch as a reference.

    Any of A/B/C may carry leading batch dimensions; 2-D operands are
    broadcast (repeated) across the batch. The first batched operand found
    determines the batch shape of the result (operands with a batch
    dimension are assumed to have matching batch counts). When no operand
    is batched, the result still carries a leading batch dimension of 1.
    """
    # Find the governing batch shape from the first batched operand.
    batch_dims = (1,)
    for operand in (A, B, C):
        if operand.dim() > 2:
            batch_dims = operand.shape[:-2]
            break

    flat_batch = prod(batch_dims)

    def to_batched(tensor):
        # 2-D operands are tiled across the batch; batched operands are
        # flattened to a single leading batch dimension for bmm.
        if tensor.dim() == 2:
            return tensor.unsqueeze(0).repeat(flat_batch, 1, 1)
        return tensor.reshape(-1, tensor.size(-2), tensor.size(-1))

    A = to_batched(A)
    B = to_batched(B)
    C = to_batched(C)

    result = torch.bmm(A, B) * alpha + C * beta
    # Restore the original (possibly multi-dimensional) batch shape.
    return result.reshape(*(batch_dims + C.shape[-2:]))
|
||||
|
||||
|
||||
def initialize(rows, cols, batch):
    """Create a half-precision CUDA tensor of random integers in [-3, 3).

    ``batch`` is a tuple of leading batch dimensions. When it is empty or
    collapses to a single batch, a plain 2-D (rows, cols) tensor is
    returned; otherwise the tensor is shaped (*batch, rows, cols).
    """
    count = rows * cols * prod(batch)
    flat = torch.randint(-3, 3, size=(count,), device='cuda').half()
    if len(batch) > 0 and prod(batch) > 1:
        return flat.reshape(*(batch + (rows, cols)))
    return flat.reshape(rows, cols)
|
||||
|
||||
|
||||
class GemmF16Batched(unittest.TestCase):
    """Functional tests for batched F16 GEMM.

    Each test runs the CUTLASS kernel for a batch shape and a subset of
    batched operands, then checks the device result against the PyTorch
    reference (non-batched operands are broadcast across the batch).
    """

    def run_batched(self, batch_count: tuple, batch_A: bool, batch_B: bool, batch_C: bool):
        """Run one batched GEMM and compare it to the torch reference.

        :param batch_count: leading batch dimensions of the problem
        :param batch_A/B/C: whether the corresponding operand carries the
            batch dimensions (otherwise it is a single 2-D matrix)
        """
        M, N, K = 512, 256, 128
        alpha, beta = 1., 2.

        unbatched = (1,)
        A = initialize(M, K, batch_count if batch_A else unbatched)
        B = initialize(K, N, batch_count if batch_B else unbatched)
        C = initialize(M, N, batch_count if batch_C else unbatched)
        D = initialize(M, N, batch_count)

        plan = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=cutlass.DataType.f32)
        plan.run(A, B, C, D, alpha, beta)

        reference = pytorch_reference(A, B, C, alpha, beta)
        assert reference.equal(D)

    def _run_both_shapes(self, batch_A: bool, batch_B: bool, batch_C: bool):
        # Every operand combination is exercised with both a 1-D and a
        # 2-D batch shape.
        self.run_batched((3,), batch_A, batch_B, batch_C)
        self.run_batched((2, 3), batch_A, batch_B, batch_C)

    def test_batched_ABC(self):
        self._run_both_shapes(True, True, True)

    def test_batched_AB(self):
        self._run_both_shapes(True, True, False)

    def test_batched_AC(self):
        self._run_both_shapes(True, False, True)

    def test_batched_BC(self):
        self._run_both_shapes(False, True, True)

    def test_batched_A(self):
        self._run_both_shapes(True, False, False)

    def test_batched_B(self):
        self._run_both_shapes(False, True, False)


if __name__ == '__main__':
    unittest.main()
|
||||
125
test/python/cutlass/gemm/gemm_f16_sm80.py
Normal file
125
test/python/cutlass/gemm/gemm_f16_sm80.py
Normal file
@ -0,0 +1,125 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with F16 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80(unittest.TestCase):
    """Empty container class; SM80 F16 test methods are attached dynamically in __main__."""
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80StreamK(unittest.TestCase):
    """Empty container class; SM80 stream-K test methods are attached dynamically in __main__."""
|
||||
|
||||
# Dynamic test registration: each add_test_* call attaches one test method
# to the given wrapper class, specialized for F16 operands on SM80.
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Sweep all eight layout combinations at the default tile shape.
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
# Additional TNT cases varying tile shape, warp count, alignment,
# accumulator type, and stage count.
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 32], warp_count=[2, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)

if __name__ == '__main__':
    unittest.main()
|
||||
140
test/python/cutlass/gemm/gemm_f16_sm90.py
Normal file
140
test/python/cutlass/gemm/gemm_f16_sm90.py
Normal file
@ -0,0 +1,140 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with F16 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF16Sm90(unittest.TestCase):
    """Empty container class; SM90 F16 test methods are attached dynamically in __main__."""
|
||||
|
||||
|
||||
# Dynamic test registration for SM90: the 3.x API selects warp counts
# internally (warp_count=None) and compiles with nvcc only.
add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
                               warp_count=None, compilation_modes=['nvcc'])

add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Tests with 1x1x1 clusters
add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)

# Tests with different cluster shapes
add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                       element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])

# Tests for different schedule modes: pingpong and cooperative
# warp-specialized kernels, each with its matching TMA epilogue.
add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
                            element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
                            opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8])
add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8])
add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])


if __name__ == '__main__':
    unittest.main()
|
||||
100
test/python/cutlass/gemm/gemm_f32_sm80.py
Normal file
100
test/python/cutlass/gemm/gemm_f32_sm80.py
Normal file
@ -0,0 +1,100 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with F32 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80(unittest.TestCase):
    """Empty container class; SM80 F32 test methods are attached dynamically in __main__."""
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80StreamK(unittest.TestCase):
    """Empty container class; SM80 F32 stream-K test methods are attached dynamically in __main__."""
|
||||
|
||||
|
||||
# Every test in this file uses F32 operands on SM80; clusters are an SM90
# feature, so the cluster shape is pinned to 1x1x1.
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)


if __name__ == '__main__':
    unittest.main()
|
||||
99
test/python/cutlass/gemm/gemm_f64_sm80.py
Normal file
99
test/python/cutlass/gemm/gemm_f64_sm80.py
Normal file
@ -0,0 +1,99 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with F64 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
# Surface only warnings and errors from the CUTLASS Python interface.
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by every test in this file (SM80 / Ampere).
cc = 80
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80(unittest.TestCase):
    """
    Empty shell for F64 SM80 GEMM tests; concrete test methods are attached
    to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80StreamK(unittest.TestCase):
    """
    Empty shell for F64 SM80 stream-K GEMM tests; concrete test methods are
    attached to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
# Every test in this file uses F64 operands on SM80; clusters are an SM90
# feature, so the cluster shape is pinned to 1x1x1.
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                 element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)


if __name__ == '__main__':
    unittest.main()
|
||||
69
test/python/cutlass/gemm/gemm_f64_sm90.py
Normal file
69
test/python/cutlass/gemm/gemm_f64_sm90.py
Normal file
@ -0,0 +1,69 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with F64 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
# Surface only warnings and errors from the CUTLASS Python interface.
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by every test in this file (SM90 / Hopper).
cc = 90
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF64Sm90(unittest.TestCase):
    """
    Empty shell for F64 SM90 GEMM tests; concrete test methods are attached
    to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
# Every test in this file uses F64 operands on SM90 with a 1x1x1 cluster and
# alignment 1; only nvcc is exercised as the compilation backend.
add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
                               element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
                               element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])

# TensorOp and SIMT variants over a few layout / tile-size combinations.
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.NNT, threadblock_shape=[128, 128, 32], stages=3)
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.TNN, threadblock_shape=[128, 128, 32], stages=3)
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.NNN, threadblock_shape=[128, 128, 8], stages=2)
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.TTT, threadblock_shape=[ 64, 128, 8], stages=2)


if __name__ == '__main__':
    unittest.main()
|
||||
99
test/python/cutlass/gemm/gemm_s8_sm80.py
Normal file
99
test/python/cutlass/gemm/gemm_s8_sm80.py
Normal file
@ -0,0 +1,99 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with S8 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
# Surface only warnings and errors from the CUTLASS Python interface.
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by every test in this file (SM80 / Ampere).
cc = 80
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80(unittest.TestCase):
    """
    Empty shell for S8 SM80 GEMM tests; concrete test methods are attached
    to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80StreamK(unittest.TestCase):
    """
    Empty shell for S8 SM80 stream-K GEMM tests; concrete test methods are
    attached to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
# Every test in this file uses S8 operands on SM80; clusters are an SM90
# feature, so the cluster shape is pinned to 1x1x1.
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp (accumulation is always in S32)
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass.DataType.s32,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                 element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)


if __name__ == '__main__':
    unittest.main()
|
||||
95
test/python/cutlass/gemm/gemm_s8_sm90.py
Normal file
95
test/python/cutlass/gemm/gemm_s8_sm90.py
Normal file
@ -0,0 +1,95 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for GEMM with S8 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
# Surface only warnings and errors from the CUTLASS Python interface.
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by every test in this file (SM90 / Hopper).
cc = 90
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmS8Sm90(unittest.TestCase):
    """
    Empty shell for S8 SM90 GEMM tests; concrete test methods are attached
    to it dynamically by the add_test_* registration calls in this module.
    """
|
||||
|
||||
|
||||
# Every test in this file uses S8 operands on SM90; only nvcc is exercised as
# the compilation backend. `stages=None` lets CUTLASS pick the stage count.
add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])

add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Tests with 1x1x1 clusters
add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)

# Tests with different cluster shapes
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)

# Tests with warp-specialized ping-pong schedule
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
                  kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
                  epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)

# Tests for SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)


if __name__ == '__main__':
    unittest.main()
|
||||
387
test/python/cutlass/gemm/gemm_testbed.py
Normal file
387
test/python/cutlass/gemm/gemm_testbed.py
Normal file
@ -0,0 +1,387 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from math import prod
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import torch
|
||||
|
||||
from cutlass import (
|
||||
DataType,
|
||||
DataTypeSize,
|
||||
GemmUniversalMode,
|
||||
LayoutType,
|
||||
OpcodeClass,
|
||||
ShortDataTypeNames,
|
||||
SwizzlingFunctor
|
||||
)
|
||||
|
||||
from cutlass.backend import compiler
|
||||
from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
|
||||
from cutlass.backend.memory_manager import get_allocated_size
|
||||
from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
|
||||
from cutlass.shape import GemmCoord, MatrixCoord
|
||||
from cutlass.utils.datatypes import torch_type
|
||||
|
||||
|
||||
class GemmUniversalLauncher:
|
||||
def __init__(
|
||||
self,
|
||||
operation,
|
||||
seed=2080,
|
||||
verification=True,
|
||||
iterations=500,
|
||||
compiler_mode= "nvcc",
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Create the reduction kernel, if needed
|
||||
self.reduction_operation: ReductionOperation = ReductionOperation(
|
||||
shape=MatrixCoord(4, 32 * operation.C.alignment),
|
||||
C=operation.C,
|
||||
element_accumulator=operation.tile_description.math_instruction.element_accumulator,
|
||||
element_compute=operation.epilogue_functor.element_epilogue,
|
||||
epilogue_functor=operation.epilogue_functor,
|
||||
count=operation.C.alignment,
|
||||
)
|
||||
|
||||
self.math_operation = operation.tile_description.math_instruction.math_operation
|
||||
self.verification = verification
|
||||
|
||||
if compiler_mode == "nvcc":
|
||||
compiler.nvcc()
|
||||
elif compiler_mode == "nvrtc":
|
||||
compiler.nvrtc()
|
||||
else:
|
||||
raise Exception(f"Unexpected compiler string {compiler_mode}")
|
||||
|
||||
op_list = [operation]
|
||||
if operation.arch < 90:
|
||||
# Split K via Python is currently only supported for pre-SM90 kernels
|
||||
op_list.append(self.reduction_operation)
|
||||
|
||||
compiler.add_module(op_list, bypass_cache=False)
|
||||
|
||||
self.operation = operation
|
||||
|
||||
self.dtype_A = torch_type(operation.A.element)
|
||||
self.dtype_B = torch_type(operation.B.element)
|
||||
self.dtype_C = torch_type(operation.C.element)
|
||||
self.dtype_D = torch_type(operation.C.element)
|
||||
|
||||
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
|
||||
element_size = DataTypeSize[operation.A.element]
|
||||
|
||||
if element_size == 1:
|
||||
self.rand_max = 1
|
||||
self.rand_min = 0
|
||||
elif element_size <= 8:
|
||||
self.rand_max = 1
|
||||
self.rand_min = -1
|
||||
elif element_size == 16:
|
||||
self.rand_max = 4
|
||||
self.rand_min = -4
|
||||
else:
|
||||
self.rand_max = 8
|
||||
self.rand_min = -8
|
||||
|
||||
self.seed = seed
|
||||
|
||||
self.compute_type = operation.epilogue_functor.element_epilogue
|
||||
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
|
||||
|
||||
def print_problem_size(self, p, mode, batch_count):
|
||||
if mode == GemmUniversalMode.Gemm:
|
||||
mode = "Gemm"
|
||||
elif mode == GemmUniversalMode.Batched:
|
||||
mode = "GemmBatched"
|
||||
elif mode == GemmUniversalMode.GemmSplitKParallel:
|
||||
mode = "GemmSplitKParallel"
|
||||
print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")
|
||||
|
||||
    def uniform_init(self, shape, dtype, layout):
        """
        Allocate and randomly initialize a tensor of the given shape and dtype.

        Returns a pair ``(data_cutlass, data_ref)`` holding the same values:
        ``data_cutlass`` is on the GPU in the physical layout the kernel
        expects, while ``data_ref`` is a row-major view used by the PyTorch
        reference computation (it may live on CPU, see below).
        """
        size = prod(shape)
        if dtype.is_floating_point:
            # Draw uniformly from [rand_min - 0.5, rand_max - 0.5) and round up,
            # yielding integer-valued floats in [rand_min, rand_max] so the
            # comparison against the reference can be exact.
            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
        else:
            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
            # Fall back to CPU for integer type references.
            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)

        # fp32/fp64 reference data is also kept on CPU; `reference` then runs
        # the whole reference computation there.
        if dtype == torch.float64 or dtype == torch.float32:
            data = data.to("cpu")

        data_ref = data.reshape(shape)

        # Column-major storage is realized as the contiguous transpose of the
        # row-major view; row-major data can be shared directly.
        if layout == LayoutType.RowMajor:
            data_cutlass = data_ref
        else:
            data_cutlass = data_ref.transpose(-1, -2).contiguous()

        # The kernel always consumes device memory, regardless of where the
        # reference copy lives.
        data_cutlass = data_cutlass.to("cuda")
        return data_cutlass, data_ref
|
||||
|
||||
def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
|
||||
# If any tensor is on CPU, place all tensors on CPU unless only
|
||||
# tensor C is on CPU
|
||||
devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
|
||||
if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
|
||||
device = torch.device("cpu")
|
||||
else:
|
||||
device = tensor_A.device
|
||||
|
||||
tensor_A = tensor_A.to(device)
|
||||
tensor_B = tensor_B.to(device)
|
||||
tensor_C = tensor_C.to(device)
|
||||
|
||||
dtype = torch_type(self.compute_type)
|
||||
alpha_torch = torch.tensor([alpha], device=device).to(dtype)
|
||||
beta_torch = torch.tensor([beta], device=device).to(dtype)
|
||||
|
||||
tmp = tensor_A @ tensor_B
|
||||
tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
|
||||
return tensor_D_ref.to(self.dtype_D)
|
||||
|
||||
def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
    """
    Run a single GEMM test case: initialize operands, launch the kernel (plus a
    reduction kernel for parallel split-K), and optionally verify the result
    against a reference computation.

    :param mode: GemmUniversalMode to run under (Gemm, Batched, GemmSplitKParallel)
    :param problem_size: GEMM problem dimensions, with fields m, n, k
    :param batch_count: batch count or split-K slice count (overloaded; see comment below)
    :param split_k_slices: number of split-K slices
    :param alpha: epilogue scalar alpha
    :param beta: epilogue scalar beta

    :return: whether the run passed verification (True when verification is disabled)
    :rtype: bool
    """
    # Seed the RNG so operand initialization is reproducible.
    torch.random.manual_seed(self.seed)

    # Assign an actual batch count in cases where we are not running in batched mode.
    # This is to differentiate between the number of split K slices and the batch count,
    # which are overloaded within the single `batch_count` variable.
    if mode == GemmUniversalMode.Batched:
        true_batch_count = batch_count
    else:
        true_batch_count = 1

    # When the operation's operands were swapped ("switched"), each operand's
    # layout must be taken from the opposite operand and transposed.
    def transpose(layout):
        if layout == LayoutType.RowMajor:
            return LayoutType.ColumnMajor
        else:
            return LayoutType.RowMajor

    # A is (batch, M, K)
    tensor_A, tensor_A_ref = self.uniform_init(
        (true_batch_count, problem_size.m, problem_size.k),
        self.dtype_A,
        self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
    )
    # B is (batch, K, N)
    tensor_B, tensor_B_ref = self.uniform_init(
        (true_batch_count, problem_size.k, problem_size.n),
        self.dtype_B,
        self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
    )
    # C is (batch, M, N)
    tensor_C, tensor_C_ref = self.uniform_init(
        (true_batch_count, problem_size.m, problem_size.n),
        self.dtype_C,
        self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
    )
    tensor_D = torch.zeros_like(tensor_C)

    # Integer accumulation requires integer epilogue scalars.
    if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
        alpha = int(alpha)
        beta = int(beta)

    #
    # Launch kernel
    #

    arguments = GemmArguments(
        operation=self.operation,
        problem_size=problem_size,
        A=tensor_A,
        B=tensor_B,
        C=tensor_C,
        D=tensor_D,
        output_op=self.operation.epilogue_type(alpha, beta),
        gemm_mode=mode,
        split_k_slices=split_k_slices,
        batch=batch_count,
    )

    # Parallel split-K writes partial results to a workspace (arguments.ptr_D)
    # that a separate reduction kernel combines into tensor_D.
    if mode == GemmUniversalMode.GemmSplitKParallel:
        reduction_arguments = ReductionArguments(
            self.reduction_operation,
            problem_size=[problem_size.m, problem_size.n],
            partitions=split_k_slices,
            workspace=arguments.ptr_D,
            destination=tensor_D,
            source=tensor_C,
            output_op=self.reduction_operation.epilogue_type(alpha, beta),
        )

    self.operation.run(arguments)

    if mode == GemmUniversalMode.GemmSplitKParallel:
        self.reduction_operation.run(reduction_arguments)

    passed = True

    if self.verification:
        # Synchronize on whichever argument set performed the final write to D.
        if mode == GemmUniversalMode.GemmSplitKParallel:
            reduction_arguments.sync()
        else:
            arguments.sync()
        tensor_D_ref = self.reference(
            problem_size,
            tensor_A_ref,
            tensor_B_ref,
            tensor_C_ref,
            alpha,
            beta,
        )

        tensor_D_ref = tensor_D_ref.to('cuda')

        # uniform_init keeps the reference copies row-major, so transpose the
        # kernel result when it was produced column-major (or with switched
        # operands) before the elementwise comparison.
        if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
            tensor_D = tensor_D.transpose(-1, -2).contiguous()

        passed = tensor_D.equal(tensor_D_ref)

        try:
            assert passed
        except AssertionError:
            # Report the failing configuration, then fall through to cleanup.
            self.print_problem_size(problem_size, mode, batch_count)
    # Explicitly drop argument objects so the allocation check below observes
    # every device allocation from this run as released.
    del arguments
    if mode == GemmUniversalMode.GemmSplitKParallel:
        del reduction_arguments

    cur_size = get_allocated_size()
    assert cur_size == 0, f"{cur_size} B of memory were not released after this run"

    return passed
|
||||
|
||||
|
||||
def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
    """
    Sweeps a set of GEMM problem sizes, modes, and batch counts for `operation`
    and verifies each run against a reference.

    :param operation: GEMM operation to test
    :param testcase: sweep selection, "universal" or "multistage" ("interleaved" is unsupported)
    :param compilation_mode: compiler used to build the kernel ('nvcc' or 'nvrtc')

    :return: whether all exercised test cases passed
    :rtype: bool
    """
    passed = True

    # Alignment is derived from the narrower of the A/B element types
    # (128-bit accesses for non-SIMT opcode classes).
    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
    )
    opcode_class = operation.tile_description.math_instruction.opcode_class

    if opcode_class == OpcodeClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size

    alignment_m = alignment
    alignment_n = alignment
    alignment_k = alignment

    # INT8 alignment constraints
    if opcode_class == OpcodeClass.Simt:
        A_is_s8 = operation.A.element == DataType.s8
        B_is_s8 = operation.B.element == DataType.s8

        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
            alignment_m = 4
        # Fixed: this previously read `B_is_s8 == DataType.s8` -- comparing a
        # bool against DataType.s8, which is always False -- and checked
        # operand A's layout instead of B's, so alignment_n was never applied.
        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
            alignment_n = 4
        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
            alignment_k = 4

    threadblock_k = operation.tile_description.threadblock_shape[2]

    assert testcase != "interleaved"

    # Split-K is exercised only pre-SM90 and only for non-stream-K swizzling.
    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK

    if testcase == "multistage":
        modes = [GemmUniversalMode.Gemm]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        # K sizes chosen around the threadblock-K * stage count boundary.
        problem_size_k = [
            threadblock_k,
            threadblock_k * operation.tile_description.stages
            + operation.tile_description.math_instruction.instruction_shape[2],
        ]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1]
    else:
        modes = [GemmUniversalMode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if supports_split_k:
            modes.append(GemmUniversalMode.GemmSplitKParallel)

        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        # Stage count may be None (auto); use a representative value for sizing K.
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k,
        ]
        problem_alpha = [1.0]
        problem_beta = [2.0]

    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)

    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if k // batch_count < 2 * threadblock_k:
                                        continue

                                problem_size = GemmCoord(m, n, k)

                                # batch_count doubles as the split-K slice count
                                # where split-K is supported.
                                if supports_split_k:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1

                                # Plain GEMM with batch_count > 1 runs in batched mode.
                                overridden_mode = mode
                                if mode == GemmUniversalMode.Gemm and batch_count > 1:
                                    overridden_mode = GemmUniversalMode.Batched

                                passed = testbed.run(
                                    overridden_mode,
                                    problem_size,
                                    batch_count,
                                    split_k_slices,
                                    alpha,
                                    beta,
                                )

                                # Stop at the first failing configuration.
                                if not passed:
                                    return False

    return passed
|
||||
44
test/python/cutlass/gemm/run_all_tests.py
Normal file
44
test/python/cutlass/gemm/run_all_tests.py
Normal file
@ -0,0 +1,44 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Discover every gemm_*.py test module living next to this script and run
    # the resulting suite, failing loudly if any case does not pass.
    test_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(test_dir, 'gemm_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception('Test cases failed')
|
||||
239
test/python/cutlass/gemm/utils.py
Normal file
239
test/python/cutlass/gemm/utils.py
Normal file
@ -0,0 +1,239 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass
|
||||
|
||||
from cutlass import (
|
||||
DataTypeNames,
|
||||
EpilogueScheduleSuffixes,
|
||||
KernelScheduleSuffixes,
|
||||
LayoutType,
|
||||
OpcodeClassNames,
|
||||
ShortDataTypeNames,
|
||||
ShortLayoutTypeNames
|
||||
)
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.utils.software import SubstituteTemplate
|
||||
|
||||
from gemm_testbed import test_all_gemm
|
||||
|
||||
|
||||
class Layout:
    """
    Maps BLAS-style transpose ('T') / non-transpose ('N') operand naming onto
    CUTLASS row-major / column-major layout types.
    """

    # 'T' (transposed) corresponds to a row-major operand
    T = LayoutType.RowMajor
    # 'N' (non-transposed) corresponds to a column-major operand
    N = LayoutType.ColumnMajor
|
||||
|
||||
|
||||
class LayoutCombination:
    """
    Enumerates every (A, B, C) combination of transposed/non-transposed operand
    layouts for a GEMM, named by the customary BLAS three-letter convention.
    """

    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)
|
||||
|
||||
|
||||
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Generates a procedural name for a test case.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param kernel_schedule: kernel_schedule type
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: str
    """
    name_format = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"

    layout_A, layout_B, layout_C = layouts
    align_A, align_B, align_C = alignments

    substitutions = {
        "arch": str(arch),
        "eA": DataTypeNames[element_a],
        "eB": DataTypeNames[element_b],
        "eC": DataTypeNames[element_output],
        "lA": ShortLayoutTypeNames[layout_A],
        "lB": ShortLayoutTypeNames[layout_B],
        "lC": ShortLayoutTypeNames[layout_C],
        "opclass": OpcodeClassNames[opclass],
        "acc": DataTypeNames[element_accumulator],
        "cM": str(cluster_shape[0]),
        "cN": str(cluster_shape[1]),
        "cK": str(cluster_shape[2]),
        "tbM": str(threadblock_shape[0]),
        "tbN": str(threadblock_shape[1]),
        "tbK": str(threadblock_shape[2]),
        # `None` stages means the library chooses automatically.
        "stages": "auto" if stages is None else str(stages),
        "aA": str(align_A),
        "aB": str(align_B),
        "aC": str(align_C),
        "k": KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else "",
        "e": EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else "",
        "suffix": suffix if suffix is not None else "",
    }
    return SubstituteTemplate(name_format, substitutions)
|
||||
|
||||
|
||||
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=['nvcc', 'nvrtc']):
    """
    Create test-running functions with the given specification and set it as a method of ``cls``.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param compilation_modes: list of compilers to used in testing the kernel (options: 'nvrtc', 'nvcc')
    :type compilation_modes: list
    """

    for compilation_mode in compilation_modes:
        # Fixed: the test body must bind `compilation_mode` eagerly. Defining
        # `run` directly inside the loop captured the loop variable by
        # reference (Python's late-binding closure semantics), so every
        # generated test method -- including the one *named* `..._nvcc` --
        # actually compiled with the final entry of `compilation_modes`.
        # A factory function gives each `run` its own binding.
        def make_run(compilation_mode):
            def run(self):
                """
                Dynamically-generated function that constructs a GEMM operation and verifies it against
                multiple test cases.
                """
                element_A = element
                element_B = element
                layout_A, layout_B, layout_C = layouts
                alignment_A, alignment_B, alignment_C = alignments

                plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                                       element_C=element_output, element_D=element_output,
                                       layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                       element_accumulator=element_accumulator,
                                       kernel_cc=cc)

                plan.opclass = opclass
                if swizzle is not None:
                    plan.swizzling_functor = swizzle

                # Start from the first available tile description and override
                # the fields the caller specified.
                td = plan.tile_descriptions()[0]

                if warp_count is not None:
                    td.warp_count = warp_count
                td.threadblock_shape = threadblock_shape
                td.stages = stages
                td.cluster_shape = cluster_shape
                op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
                self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))
            return run

        run = make_run(compilation_mode)

        element_epilogue = element_accumulator
        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')

        setattr(cls, name, run)
|
||||
284
test/python/cutlass/interface/conv2d_interface.py
Normal file
284
test/python/cutlass/interface/conv2d_interface.py
Normal file
@ -0,0 +1,284 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Tests the high-level Conv2d interface
|
||||
"""
|
||||
|
||||
from math import ceil
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass.utils.datatypes as datatypes
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from utils import ExpectException
|
||||
import os
|
||||
|
||||
|
||||
class Conv2dEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Conv2d interface.

    Builds one baseline plan/operation from fully-specified data types, then
    asserts that alternative constructions (generic element, tensor frontends,
    implied accumulator) produce an operation emitting identical source.
    """
    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
                 alignment_A, alignment_B, alignment_C):
        # Operand data types and alignments used for every constructed plan.
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C

        # Convolution kind ("fprop", "wgrad", or "dgrad").
        self.conv_kind = conv_kind

        # Baseline plan constructed with every type spelled out explicitly.
        self.plan = cutlass.op.Conv2d(
            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
            element_D=element_D, element_accumulator=element_accumulator)

        # Baseline operation against which all other constructions are compared.
        self.op = self.plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default Conv2d
        :type other_plan: cutlass.op.Conv2d

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

        # Equivalence is judged by comparing the emitted kernel source.
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
        and layouts for constructing the Conv2d interface
        """
        if not datatypes.numpy_available:
            return

        # Test when specifying all parameters
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A and B as tensors using generic element and output
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)

        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Conv2d(
                kind=self.conv_kind,
                element_C=self.element_C,
                element_D=self.element_D,
                element=self.element_A)
            assert self._plans_equal(plan_other)

        # Test with only the generic types. Only run if the types of A, B, C, D, and the accumulator are the same
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator):
            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
        """
        if not datatypes.numpy_available:
            return

        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)

        # Tiny placeholder tensors: only their dtypes matter for plan construction.
        size = (2, 2)
        A = np.zeros(size, dtype=type_A)
        B = np.zeros(size, dtype=type_B)
        C = np.zeros(size, dtype=type_C)
        D = np.zeros(size, dtype=type_D)

        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def torch_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
        """
        if not datatypes.torch_available:
            return

        import torch
        type_A = datatypes.torch_type(self.element_A)
        type_B = datatypes.torch_type(self.element_B)
        type_C = datatypes.torch_type(self.element_C)
        type_D = datatypes.torch_type(self.element_D)
        type_accum = datatypes.torch_type(self.element_accumulator)

        # Tiny placeholder tensors: only their dtypes matter for plan construction.
        size = (2, 2)

        A = torch.empty(size, dtype=type_A)
        B = torch.empty(size, dtype=type_B)
        C = torch.empty(size, dtype=type_C)
        D = torch.empty(size, dtype=type_D)

        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
        """
        Shared body of numpy_test/torch_test: constructs plans from frontend
        tensors and checks each against the baseline.
        """
        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        if type_A == type_B:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
            assert self._plans_equal(plan_np)

        # Test without explicit accumulator. Only run if the types of C and the accumulator are equal.
        if type_C == type_accum:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Conv2d interface
        """
        self.generic_test()
        self.numpy_test()
        self.torch_test()
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
    """
    Container for tests of equivalent Conv2d constructions; the individual
    test methods are attached dynamically, one per type combination.
    """
    pass
|
||||
|
||||
# Memory alignment (in elements) used for operands of each data type:
# 8 f16 or 4 f32 elements = 16 bytes per access.
type2alignment = {
    cutlass.DataType.f16: 8,
    cutlass.DataType.f32: 4
}
|
||||
|
||||
def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
    """
    Attaches a test method to ``ConvEquivalenceTest`` exercising the given
    convolution kind and operand data types.
    """
    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"

    def run(self):
        checker = Conv2dEquivalence(
            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A],
            alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C],
        )
        checker.test_all()

    setattr(ConvEquivalenceTest, test_name, run)
|
||||
|
||||
# (A, B, C, D, accumulator) data-type combinations to exercise for each conv kind.
_TYPE_COMBINATIONS = [
    (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16),
    (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32),
    (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16),
    (cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32),
    (cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32),
]

for conv_kind in ["fprop", "wgrad", "dgrad"]:
    for elem_A, elem_B, elem_C, elem_D, elem_acc in _TYPE_COMBINATIONS:
        add_test(conv_kind, elem_A, elem_B, elem_C, elem_D, elem_acc)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Conv2d interface
    """

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)

        # NOTE(review): the first ExpectException argument presumably toggles
        # whether an exception is expected -- confirm against utils.ExpectException.
        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)

        # Deliberately malformed threadblock shape (non-power-of-two M and K extents).
        td = plan.tile_descriptions()[0]
        td.threadblock_shape=[17, 32, 5]

        plan.tile_description = td
        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()
        # Clean up the error message
        os.remove("./cutlass_python_compilation_device_error.txt")
|
||||
|
||||
# Allow running this test file directly (e.g. `python <file>`) in addition to
# discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
|
||||
245
test/python/cutlass/interface/evt_interface.py
Normal file
245
test/python/cutlass/interface/evt_interface.py
Normal file
@ -0,0 +1,245 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Test the EVT interface
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass import LayoutType, Tensor
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from cutlass.epilogue import reshape, permute
|
||||
|
||||
from utils import ExpectException
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class EVTErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the EVT interface

    Each test traces a small Python epilogue function with
    ``cutlass.epilogue.trace`` and uses ``ExpectException`` (with message
    verification) to pin the exact error raised for the malformed input.
    """

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
    def test_root_not_d(self):
        """
        Test when "D" does not exist in Sm90 EVT
        """
        def evt_root_not_d(accum, alpha):
            F = accum * alpha
            return F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        # The exception is only expected on Sm90, where the traced epilogue
        # must return a tensor named "D".
        with ExpectException(device_cc() == 90,
            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
            "but the variable 'D' is not found in the return values.", True):

            cutlass.epilogue.trace(evt_root_not_d, example_tensors)

    def test_no_accum(self):
        """
        Test when "accum" is not in input arguments
        """
        def evt_no_accum(alpha, C):
            D = alpha * C
            return D

        example_tensors = {
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
            cutlass.epilogue.trace(evt_no_accum, example_tensors)

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
    def test_too_much_shared_memory(self):
        """
        Test when the epilogue consumes too much shared memory
        """
        # Five auxiliary outputs (D1..D4 plus D) each require their own
        # shared-memory staging, exceeding the available budget.
        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
            D = D4 + C5
            return D, D1, D2, D3, D4

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C1": self.fake_tensor(np.float16, (6, 512, 512)),
            "C2": self.fake_tensor(np.float16, (6, 512, 512)),
            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        # Tracing itself succeeds; the error is raised when the visitor is
        # attached to a plan below.
        epilogue_visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, example_tensors)

        plan = cutlass.op.Gemm(
            element=np.float16, layout=cutlass.LayoutType.RowMajor,
            element_accumulator=np.float32
        )

        with ExpectException(True,
            "RuntimeError: The epilogue consumes too much shared memory. "
            "No valid tile description is found in the generator.", True):
            plan.epilogue_visitor = epilogue_visitor

    def test_not_ssa(self):
        """
        Test when the epilogue is not in SSA
        """
        # Case 1: a variable is assigned twice (redefinition).
        def evt_redefine(accum, C, alpha):
            F = accum + C
            F = F * alpha
            D = F
            return D, F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
            cutlass.epilogue.trace(evt_redefine, example_tensors)

        # Case 2: a variable ('C') is used without ever being defined or
        # listed as an argument.
        def evt_undefine(accum, alpha):
            F = accum + C
            D = F * alpha
            return D, F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
            cutlass.epilogue.trace(evt_undefine, example_tensors)

    def test_missing_example_tensor(self):
        """
        Test when the example tensor of an input/output variable is not provided
        """
        def evt_missing_example_tensor(accum, C):
            D = accum + C
            return D

        # Missing the output tensor "D".
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

        # Missing the input tensor "C".
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

    def test_return_expression(self):
        """
        Test when the return value is an expression
        """
        def evt_return_expr(accum, C):
            return accum + C

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
            cutlass.epilogue.trace(evt_return_expr, example_tensors)

    def test_incompatible_shape(self):
        """
        Test when the shape of example tensors are incompatible
        """
        def evt_incompatible_shape(accum, C):
            D = accum + C
            return D

        # accum's second dimension (256) conflicts with C's (512).
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True,
            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
            cutlass.epilogue.trace(evt_incompatible_shape, example_tensors)

    def test_no_matching_impl(self):
        """
        Test when no epilogue-op implementation matches a node's layout
        (here, a permuted+reshaped bias whose resulting stride is unsupported)
        """
        def evt_no_matching_impl(accum, bias):
            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
            return D

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
            "bias": self.fake_tensor(np.float16, (16, 32)),
            "D": self.fake_tensor(np.float16, (6, 512, 256))
        }

        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
            cutlass.epilogue.trace(evt_no_matching_impl, example_tensors)

    #
    # Helper functions
    #

    def fake_tensor(self, element, shape):
        # Row-major placeholder tensor used only to describe dtype/shape to the tracer.
        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
|
||||
|
||||
|
||||
# Allow running this test file directly (e.g. `python <file>`) in addition to
# discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
|
||||
351
test/python/cutlass/interface/gemm_interface.py
Normal file
351
test/python/cutlass/interface/gemm_interface.py
Normal file
@ -0,0 +1,351 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Tests the high-level GEMM interface
|
||||
"""
|
||||
|
||||
from math import ceil
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass.utils.datatypes as datatypes
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from utils import ExpectException
|
||||
|
||||
|
||||
class GemmEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Gemm interface

    A reference plan is built in ``__init__`` from fully-specified parameters;
    each test constructs the same plan through an alternative spelling of the
    API and asserts that the emitted C++ is identical.
    """
    def __init__(self, element_A, element_B, element_C, element_D, element_accumulator,
                 layout_A, layout_B, layout_C, alignment_A, alignment_B, alignment_C):
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
        # Reference plan/operation that all alternative constructions are compared against.
        self.plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B, element_C=element_C,
                                    element_D=element_D, element_accumulator=element_accumulator,
                                    layout_A=layout_A, layout_B=layout_B, layout_C=layout_C)
        self.op = self.plan.construct(alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default GEMM
        :type other_plan: cutlass.op.Gemm

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(alignment_A=self.alignment_A, alignment_B=self.alignment_B, alignment_C=self.alignment_C)

        # Compare whether the operations are equal by comparing the C++ code that would be emitted for them
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using CUTLASS data types
        and layouts for constructing the Gemm interface
        """
        # NOTE(review): this generic test also bails out when numpy is
        # unavailable — presumably a shared prerequisite of the comparison
        # machinery; confirm.
        if not datatypes.numpy_available:
            return

        # Test when specifying all parameters
        plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_A=self.layout_A, layout_B=self.layout_B, layout_C=self.layout_C)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A
        plan_other = cutlass.op.Gemm(element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_B=self.layout_B, layout_C=self.layout_C,
                                     element=self.element_A, layout=self.layout_A)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if self.element_A == self.element_B and self.layout_A == self.layout_B:
            plan_other = cutlass.op.Gemm(element_C=self.element_C, element_D=self.element_D, element_accumulator=self.element_accumulator,
                                         layout_C=self.layout_C, element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

        # Test without explicit accumulator. Only run if the types of C and the accumulator are the same.
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                         element_D=self.element_D, layout_A=self.layout_A, layout_B=self.layout_B,
                                         layout_C=self.layout_C)
            assert self._plans_equal(plan_other)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_other = cutlass.op.Gemm(element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using numpy as a frontend
        """
        if not datatypes.numpy_available:
            return

        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)

        # Map CUTLASS layouts to numpy memory orders ('C' = row major, 'F' = column major).
        layout_to_order = {
            cutlass.LayoutType.RowMajor: 'C',
            cutlass.LayoutType.ColumnMajor: 'F'
        }
        size = (2, 2)
        A = np.zeros(size, order=layout_to_order[self.layout_A], dtype=type_A)
        B = np.zeros(size, order=layout_to_order[self.layout_B], dtype=type_B)
        C = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_C)
        D = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_D)

        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Gemm(B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A, layout_A=self.layout_A)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if type_A == type_B and self.layout_A == self.layout_B:
            plan_np = cutlass.op.Gemm(C=C, D=D, element_accumulator=type_accum, element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

        # Test without explicit accumulator. Only run if the types of C and the accumulator are the same.
        if type_C == type_accum:
            plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_np = cutlass.op.Gemm(element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Gemm interface
        """
        self.generic_test()
        self.numpy_test()
|
||||
|
||||
|
||||
class GemmEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Gemm interface

    Test names encode dtype of A/B/C/D/accumulator, layouts (t = row major,
    n = column major), and alignments of A/B/C.
    """
    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_8_8_8(self):
        """F16 operands and accumulator, all row major, alignment 8."""
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f32_ntn_8_8_8(self):
        """F16 operands with F32 accumulator, ntn layouts, alignment 8."""
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
            layout_A=cutlass.LayoutType.ColumnMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.ColumnMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_4_4_4(self):
        """F16 operands and accumulator, all row major, alignment 4."""
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            # Fixed: the test name promises alignment 4, but alignment 8 was
            # passed, making this an exact duplicate of the _8_8_8 test above.
            alignment_A=4, alignment_B=4, alignment_C=4)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for F64 Tensor Core tests.")
    def test_gemm_equivalence_f64_f64_f64_f64_f64_tnt_1_1_1(self):
        """F64 operands and accumulator, tnt layouts, alignment 1."""
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f64, element_B=cutlass.DataType.f64, element_C=cutlass.DataType.f64,
            element_D=cutlass.DataType.f64, element_accumulator=cutlass.DataType.f64,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=1, alignment_B=1, alignment_C=1)
        gemm_eq.test_all()
|
||||
|
||||
|
||||
class GemmErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
    """

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)

        with ExpectException(True, 'Alignment 16 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=16, alignment_B=16, alignment_C=16)

    def test_tensorop_availability(self):
        """
        Tests case in which only SIMT operations are available but TensorOp is requested
        """
        cc = device_cc()

        # F64 Tensor Core operations are only available on devices with CC >= 80
        supports_tensorop_f64 = cc >= 80
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f64, layout=cutlass.LayoutType.RowMajor)

        error_msg = f'Incorrectly raised an exception for availability of TensorOp with F64 operands on SM{cc}'
        with ExpectException(not supports_tensorop_f64, error_msg):
            plan.opclass = cutlass.OpcodeClass.TensorOp

        # When the assignment failed (pre-SM80), the plan must have retained SIMT.
        expected_opclass = cutlass.OpcodeClass.TensorOp if supports_tensorop_f64 else cutlass.OpcodeClass.Simt
        assert plan.opclass == expected_opclass, f'Expected opclass to be {expected_opclass}, but received {plan.opclass} for SM{cc}'

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for F16 Tensor Core tests.")
    def test_opclass_switch(self):
        """
        Tests cases in which the opcode class in question is switched (e.g., from TensorOp to SIMT)
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        assert plan.opclass == cutlass.OpcodeClass.TensorOp

        # Ensure that all tile descriptions have opclass of TensorOp
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp

        plan.opclass = cutlass.OpcodeClass.Simt

        # Ensure that all tile descriptions have opclass of Simt
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        cc = device_cc()
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        td = plan.tile_descriptions()[0]
        stages = td.stages

        # Zero stage count is valid for SM90+, as this is used to indicate that the builder's auto stage
        # count should be used
        with ExpectException(cc < 90, 'Requested zero stages'):
            td.stages = 0
            plan.construct(td)

        if cc < 90:
            with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
                td.stages = 3
                plan.construct(td)
        else:
            original_kschedule = td.kernel_schedule
            original_eschedule = td.epilogue_schedule
            with ExpectException(False, 'Incorrectly flagged an error for insufficient shared memory'):
                td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
                td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
                td.stages = 3
                plan.construct(td)

            # Reset schedules
            td.kernel_schedule = original_kschedule
            td.epilogue_schedule = original_eschedule

        with ExpectException(True, 'Requested too many stages'):
            td.stages = 100
            plan.construct(td)

        # Reset stage count
        td.stages = stages

        cluster_shape = td.cluster_shape
        with ExpectException(cc < 90, f'Requested non-unit cluster shape on SM{cc}'):
            td.cluster_shape = [2, 1, 1]
            plan.construct(td)

        # Reset cluster shape
        td.cluster_shape = cluster_shape

        with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(True, 'Requested a non-auto kernel schedule with an auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
            plan.construct(td)

        with ExpectException(True, 'Requested an auto kernel schedule with a non-auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
            td.tile_scheduler = cutlass.TileSchedulerType.StreamK
            plan.construct(td)

        # Ensure that all returned tile descriptions are unique
        ops = {}
        for i, td in enumerate(plan.tile_descriptions()):
            op = plan.construct(td)
            code_str = op.rt_module.emit()
            if code_str in ops:
                conflicting_td = ops[code_str]
                assert False, f'Multiple tile descriptions emitted {code_str}\nTile descriptions are:\n{td}\n{conflicting_td}'
            # Fixed: the emitted code was never recorded, so `code_str in ops`
            # was always False and the duplicate check could never fire.
            ops[code_str] = td
|
||||
|
||||
|
||||
# Allow running this test file directly (e.g. `python <file>`) in addition to
# discovery via a test runner.
if __name__ == '__main__':
    unittest.main()
|
||||
69
test/python/cutlass/interface/utils.py
Normal file
69
test/python/cutlass/interface/utils.py
Normal file
@ -0,0 +1,69 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Helper functions & classes for interface test
|
||||
"""
|
||||
class ExpectException:
    """
    Utility class to assert that an exception was raised when expected

    Example:

    .. highlight:: python
    .. code-block:: python

        with ExpectException(True, 'Division by zero'):
            x = 1.0 / 0.0

    :param exception_expected: whether an exception is expected to be raised
    :type exception_expected: bool
    :param message: message to print if an exception is raised when not expected or vice versa
    :type message: str
    :param verify_msg: if True, additionally require the raised exception to render exactly
        as ``message`` when formatted as ``"<ExceptionType>: <exception value>"``
    :type verify_msg: bool
    """
    def __init__(self, exception_expected: bool, message: str = '', verify_msg: bool = False):
        self.exception_expected = exception_expected
        self.message = message
        self.verify_msg = verify_msg

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        # exc_type is None iff the with-body completed without raising.
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message
        # Fixed: only compare messages when an exception was actually raised;
        # previously exc_type.__name__ crashed with AttributeError when
        # verify_msg was set but no exception occurred (and none was expected).
        if self.verify_msg and exception_raised:
            exc_message = f"{exc_type.__name__}: {exc_val}"
            assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"

        # Suppress the exception so the test continues after the with-block.
        return True
|
||||
Reference in New Issue
Block a user