CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.

* Minor fix in gemm op profiler for raster order.

* Add scheduler mapping for raster order in the kernels.
This commit is contained in:
ANIKET SHIVAM
2023-09-26 14:24:26 -07:00
committed by GitHub
parent e0aaa3c3b3
commit 90d3b0fb18
428 changed files with 22253 additions and 21762 deletions

View File

@ -0,0 +1,660 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for defining Conv2D problem sizes for testing.
This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
"""
import cutlass
from cutlass import ConvMode
from cutlass.shape import Conv2DProblemSize
class TestbedConv2dProblemSizes:
    """Collection of Conv2D problem sizes used by the unit tests.

    Ported from the C++ version in test/unit/conv/device/conv2d_problems.h.
    The filtered union of all candidate lists is exposed as ``self.all``.
    """

    def __init__(self, minimum_channel_size: int):
        """Build every candidate list, then keep only the problems whose
        per-group channel count is a multiple of ``minimum_channel_size``."""
        candidate_lists = [
            self.initialize_conv2d_default_sizes(minimum_channel_size),
            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
            self.initialize_conv2d_resnet50_sizes(1),
            self.initialize_conv2d_resnet50_sizes(34),
            self.initialize_conv2d_grouped_sizes(),
        ]
        # Filter all problems: channels-per-group must divide evenly by the
        # minimum channel size so the kernel's alignment requirements hold.
        self.all = [
            ps
            for candidates in candidate_lists
            for ps in candidates
            if (ps.C // ps.groups) % minimum_channel_size == 0
        ]
def initialize_conv2d_default_sizes(self, minimum_channel_size):
# Small input size x stride (1,1)
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
conv2d_default_sizes = []
conv2d_default_sizes.append(Conv2DProblemSize(
1, 1, 1, minimum_channel_size,
8, 1, 1, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 1, 8, minimum_channel_size,
8, 1, 3, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 7, 8, minimum_channel_size,
8, 3, 3, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 7, 9, minimum_channel_size,
8, 4, 4, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
2, 7, 9, minimum_channel_size,
8, 5, 5, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 6, 5, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 6, 6, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
3, 7, 9, minimum_channel_size,
8, 7, 7, minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
##############################################
# Small input size x stride (2,2)
# C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
##############################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 11, 7, minimum_channel_size,
8, 1, 1, minimum_channel_size,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 11, 7, minimum_channel_size,
8, 3, 3, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 11, minimum_channel_size,
8, 1, 1, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 17, 19, minimum_channel_size,
16, 2, 2, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 5, minimum_channel_size,
16, 3, 3, minimum_channel_size,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 17, 8,
24, 3, 3, 8,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 21, 8,
24, 3, 3, 8,
1, 1,
3, 3,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 20, 24, 8,
40, 3, 3, 8,
3, 3,
3, 3,
1, 1,
))
##########################################
# Medium input size (1x16x16x128), filter size (1x1, 2x2, 3x3, 5x5), stride (1, 1)
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 15, 19, 160,
224, 1, 1, 160,
0, 0,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 19, 37, 160,
224, 3, 3, 160,
1, 1,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 16, 16, 160,
224, 2, 3, 160,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 23, 21, 128,
224, 3, 3, 128,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 29, 37, 160,
224, 5, 5, 160,
2, 2,
1, 1,
1, 1,
))
##########################################
# C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 15, 19, 32 + minimum_channel_size,
96, 3, 3, 32 + minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 16, 24, 64 + minimum_channel_size,
96, 3, 3, 64 + minimum_channel_size,
1, 1,
1, 1,
1, 1,
))
##########################################
# Medium input size, filter size (1x1, 3,x3, 5x5, 7x7), stride (2, 2)
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 13, 16, 288,
160, 5, 5, 288,
2, 2,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 55, 51, 256,
512, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 71, 80, 32,
64, 5, 5, 32,
2, 2,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 224, 224, 8,
64, 7, 7, 8,
3, 3,
2, 2,
1, 1,
))
##########################################
# Medium input size stride (3, 3), filter (3, 3), non-default padding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 23, 256,
512, 3, 3, 256,
0, 0,
3, 3,
1, 1,
))
##########################################
# Medium input size padding > stride, asymmetric filter, padding and striding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 31, 256,
512, 3, 3, 256,
5, 7,
3, 4,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 35, 256,
512, 7, 5, 256,
11, 7,
3, 5,
1, 1,
))
##########################################
# Medium input size *mixed* stride (1, 2) and (2, 1),
# filter (3, 3), default padding
##########################################
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 27, 256,
512, 3, 3, 256,
1, 1,
1, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 27, 27, 256,
512, 3, 3, 256,
1, 1,
2, 1,
1, 1,
))
######################################/
# Additional input size
######################################/
conv2d_default_sizes.append(Conv2DProblemSize(
3, 28, 28, 256,
256, 2, 2, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
1, 32, 32, 16,
32, 3, 3, 16,
1, 1,
6, 2,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
32, 24, 32, 32,
32, 1, 2, 32,
0, 0,
1, 1,
1, 1,
))
conv2d_default_sizes.append(Conv2DProblemSize(
4, 2, 3, 256,
328, 3, 5, 256,
1, 1,
1, 1,
1, 1,
))
return conv2d_default_sizes
    # Add a few large and rigorous convolution problem sizes
    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
        """Return large/rigorous Conv2D problem sizes.

        NOTE(review): the body is disabled by the ``if False:`` guard below
        (presumably mirroring a ``#if 0`` in the C++ conv2d_problems.h this
        file was ported from), so this currently always returns an empty
        list. Flip the guard to re-enable the two large test cases.
        """
        sizes = []
        if False:
            # N, H, W, C of the activation tensor, then K, R, S, C of the filter.
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 124, 224, 2 * minimum_channel_size),
                (24, 7, 7, 2 * minimum_channel_size),
            ))
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 233, 35, minimum_channel_size),
                (24, 7, 5, minimum_channel_size),
            ))
        return sizes
# Add resent50 layers to unit testing sizes
def initialize_conv2d_resnet50_sizes(self, batch_size):
conv2d_problem_vector = []
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
256, 1, 1, 64,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
64, 1, 1, 64,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 64,
64, 3, 3, 64,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
64, 1, 1, 256,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
512, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 56, 56, 256,
128, 1, 1, 256,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 128,
128, 3, 3, 128,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 128,
512, 1, 1, 128,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
128, 1, 1, 512,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
1024, 1, 1, 512,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 28, 28, 512,
256, 1, 1, 512,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 256,
256, 3, 3, 256,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 256,
1024, 1, 1, 256,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
256, 1, 1, 1024,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
2048, 1, 1, 1024,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 14, 14, 1024,
512, 1, 1, 1024,
0, 0,
2, 2,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 512,
512, 3, 3, 512,
1, 1,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 512,
2048, 1, 1, 512,
0, 0,
1, 1,
1, 1,
))
conv2d_problem_vector.append(Conv2DProblemSize(
batch_size, 7, 7, 2048,
512, 1, 1, 2048,
0, 0,
1, 1,
1, 1,
))
return conv2d_problem_vector
def initialize_conv2d_grouped_sizes(self):
threadblock_n = 128
threadblock_k = 32
sizes = []
##########################################
# One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
# One CTA calculates a single group
##########################################
for cta_per_group_k in range(1, 4):
for groups in range(2, 5):
conv_k = cta_per_group_k * threadblock_n * groups
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 2 * groups,
conv_k, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
groups
))
# Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k,
threadblock_n * 2, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
sizes.append(Conv2DProblemSize(
1, 56, 56, 696,
768, 3, 3, 232,
1, 1,
2, 2,
1, 1,
ConvMode.CrossCorrelation,
1,
3
))
sizes.append(Conv2DProblemSize(
1, 14, 14, 1392,
1536, 3, 3, 232,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
3
))
##########################################
# One CTA calculate multiple groups: CTA::N % k_per_group = 0
##########################################
# 2 groups per CTA
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 4,
threadblock_n, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
# 2 groups per CTA and partial gemm_k
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k,
threadblock_n, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
2
))
# 4 groups per CTA
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 8,
threadblock_n // 2, 3, 3, threadblock_k * 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
4
))
# 4 groups per CTA and partial gemm_k
sizes.append(Conv2DProblemSize(
1, 8, 8, threadblock_k * 2,
threadblock_n // 2, 3, 3, threadblock_k // 2,
1, 1,
1, 1,
1, 1,
ConvMode.CrossCorrelation,
1,
4
))
return sizes

View File

@ -0,0 +1,146 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for Conv2d operations on SM80
"""
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from conv2d_test_utils import *
# Keep kernel-emission logs quiet during test collection.
cutlass.set_log_level(logging.WARNING)
# All tests in this file target SM80 (compute capability 8.0).
cc = 80

@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    pass
# Shared problem set for all dynamically-registered tests below.
conv_problems = get_conv_problems()

# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    # F16, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
    # F16, tensor op
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, analytic iterator (note: f16 accumulator here)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
    # F16, tensor op, f32 output
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, different tile description
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
    # F32, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
    # Tf32, tensorop
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 16],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
    )
    # Split-K
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
        split_k_slices=2)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
        split_k_slices=5)
    # Swizzling functor
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)

# Tests for few channels and fixed channels (fprop only)
# F16, tensor op, few channels
for c, tb, stage, inst in zip([2, 1],
                              [[128, 128, 64], [128, 128, 32]],
                              [3, 2],
                              [[16, 8, 16], [16, 8, 8]]):
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=tb,
        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
    )
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
    )

# Test activations, combined with serial/parallel split-K
for activation in ["relu", "leaky_relu"]:
    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
        add_test(
            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
            opclass="tensor_op", threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
            split_k_slices=split_k_slices, activation=activation)

if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,425 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for Conv2d tests.
"""
import torch
import cutlass
from cutlass import (
ConvKind,
ConvMode,
DataType,
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames,
SplitKMode,
)
from cutlass.backend.utils.software import SubstituteTemplate
from cutlass.shape import Conv2DProblemSize
from cutlass.utils.datatypes import numpy_type, torch_type
from conv2d_problem_sizes import TestbedConv2dProblemSizes
def get_name_conv2d(
    arch,
    conv_kind,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm,
    swizzle,
    split_k_mode,
    split_k_slices,
    activation
):
    """
    Generates a procedural name for a test case for conv2d

    :param arch: compute capability of kernel being generated
    :type arch: int
    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
    :type conv_kind: str
    :param element: data type of operands A and B
    :param element_accumulator: data type used in accumulation
    :param element_output: data type of operands C and D
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param warp_count: indexable container of warp counts per threadblock dimension
    :param instruction_shape: indexable container of dimensions of the MMA instruction
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param iterator_algorithm: the iterator algorithm applied, or None for "AUTO"
    :param swizzle: swizzling stride, or None for 1
    :param split_k_mode: "serial" or "parallel" split-K mode
    :param split_k_slices: number of split-K slices
    :type split_k_slices: int
    :param activation: name of the epilogue activation function
    :return: the procedurally generated test name
    :rtype: str
    """
    # Normalize the optional knobs so the name is always well-formed.
    if iterator_algorithm is None:
        iterator_algorithm = "AUTO"
    if swizzle is None:
        swizzle = 1
    name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
    return SubstituteTemplate(
        name_format,
        {
            "arch": str(arch),
            "conv_kind": conv_kind,
            "iter_alg": iterator_algorithm,
            # A and B share one element type in this test harness.
            "eA": DataTypeNames[element],
            "eB": DataTypeNames[element],
            "eC": DataTypeNames[element_output],
            "opclass": opclass,
            "acc": DataTypeNames[element_accumulator],
            "tbM": str(threadblock_shape[0]),
            "tbN": str(threadblock_shape[1]),
            "tbK": str(threadblock_shape[2]),
            # Warp tile = threadblock tile divided by warp count per dimension.
            "wM": str(threadblock_shape[0] // warp_count[0]),
            "wN": str(threadblock_shape[1] // warp_count[1]),
            "wK": str(threadblock_shape[2] // warp_count[2]),
            "IM": str(instruction_shape[0]),
            "IN": str(instruction_shape[1]),
            "IK": str(instruction_shape[2]),
            "stages": str(stages),
            "swizzle": str(swizzle),
            "split_k_mode": split_k_mode,
            "split_k_slices": str(split_k_slices),
            "activation": activation
        }
    )
def conv2d_few_channel_problemsizes(channels):
    """Return Conv2D problem sizes whose channel count equals ``channels``.

    Used by the few-channels / fixed-channels iterator tests. Each spec is
    (N, H, W, K, R, S, pad, stride); C is always ``channels``, padding and
    stride are symmetric, and dilation is 1.
    """
    specs = [
        (1, 8, 8, 16, 3, 3, 1, 2),
        (1, 16, 16, 16, 3, 3, 1, 2),
        (1, 16, 16, 16, 7, 7, 1, 1),
        (1, 224, 224, 32, 7, 7, 1, 1),
        (1, 224, 224, 64, 7, 7, 1, 2),
        (1, 224, 224, 64, 5, 5, 1, 1),
        (1, 224, 224, 64, 5, 5, 1, 2),
    ]
    return [
        Conv2DProblemSize(
            n, h, w, channels,
            k, r, s, channels,
            pad, pad,
            stride, stride,
            1, 1,
            ConvMode.CrossCorrelation,
            1, 1
        )
        for n, h, w, k, r, s, pad, stride in specs
    ]
def validate_problem_size(ps, conv_kind, split_k_slices):
    """Return True if ``ps`` is self-consistent and runnable for ``conv_kind``.

    :param ps: problem size with H/W/P/Q, pad, stride, dilation, R/S attributes
    :param conv_kind: "fprop", "dgrad", or "wgrad"
    :param split_k_slices: number of split-K slices requested
    :return: whether the problem should be run
    :rtype: bool
    """
    # The recorded output extent must match the standard convolution formula.
    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False
    # Split-K (serial or parallel) is not supported for strided dgrad.
    is_strided = ps.stride_h > 1 or ps.stride_w > 1
    if conv_kind == "dgrad" and split_k_slices > 1 and is_strided:
        return False
    return True
class Conv2dLauncherFrontend:
    """
    Runs a ``cutlass.Conv2d`` plan on random torch tensors and verifies the
    result against a PyTorch reference implementation.
    """

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        # Plan under test and its convolution kind (fprop/dgrad/wgrad).
        self.operation = plan
        self.conv_kind = plan.conv_kind
        # Seed for torch.manual_seed, applied before each run's tensor init.
        self.seed = seed
        # NOTE(review): ``backend`` is stored but tensors below are always
        # created with torch — confirm whether a numpy path is still intended.
        self.backend = backend
        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator
        # All operands use the NHWC layout.
        self.layout_A = LayoutType.TensorNHWC
        self.layout_B = LayoutType.TensorNHWC
        self.layout_C = LayoutType.TensorNHWC
        self.layout_D = LayoutType.TensorNHWC
        self.element_compute = DataType.f32
        # Narrow dtypes use a smaller value range so the exact-equality
        # comparison in run() is not defeated by rounding.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            self.rand_max = 1
        else:
            self.rand_max = 4
        self.activation = plan.activation

    def uniform_init(self, size, dtype):
        """Return a channels-last tensor of integer values in [-rand_max, rand_max]."""
        # uniform_ then ceil yields whole numbers, so fp results compare exactly.
        tensor = torch.ceil(
            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
        ).to(memory_format=torch.channels_last)
        return tensor

    def reference(self, ps, A, B, C, alpha, beta, activation):
        """Compute alpha * conv(A, B) + beta * C with torch, then apply activation."""
        if self.conv_kind == ConvKind.Fprop:
            torch_result = alpha * torch.ops.aten.conv2d(
                A,
                B,
                stride=(ps.stride_h, ps.stride_w),
                padding=(ps.pad_h, ps.pad_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Dgrad:
            # Gradient w.r.t. the input: A holds the output gradient, B the filter.
            torch_result = alpha * torch.nn.grad.conv2d_input(
                (ps.N, ps.C, ps.H, ps.W),
                B,
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Wgrad:
            # Gradient w.r.t. the filter: A holds the output gradient, B the input.
            torch_result = alpha * torch.nn.grad.conv2d_weight(
                B,
                (ps.K, ps.C, ps.R, ps.S),
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            ) + beta * C
        else:
            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

        if activation == cutlass.backend.epilogue.relu:
            torch_result = torch.nn.functional.relu(torch_result)
        elif activation == cutlass.backend.epilogue.leaky_relu:
            # Slope 0.5 matches the value set on the plan in add_test().
            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
        return torch_result

    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        """Run the plan on problem size ``ps`` and return True if it matches the reference exactly."""
        # Operand shapes depend on the convolution kind.
        if self.conv_kind == ConvKind.Fprop:
            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
        elif self.conv_kind == ConvKind.Dgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
        elif self.conv_kind == ConvKind.Wgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        # Reseed so each run is reproducible regardless of ordering.
        torch.manual_seed(self.seed)

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)

        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
                           stride=(ps.stride_h, ps.stride_w),
                           padding=(ps.pad_h, ps.pad_w),
                           dilation=(ps.dilation_h, ps.dilation_w),
                           alpha=alpha, beta=beta,
                           split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)
        torch.cuda.synchronize()

        # Integer-valued inputs make exact equality a valid pass criterion.
        passed = torch.equal(tensor_D, tensor_D_ref)
        return passed
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """Create a test-running function with the given specification and attach
    it to ``cls`` under a procedurally generated name.

    :param cls: unittest.TestCase subclass to attach the generated test to
    :param cc: compute capability being targeted
    :param conv_kind: "fprop", "dgrad", or "wgrad"
    :param problem_sizes: iterable of Conv2DProblemSize to run
    :param element: data type of operands A and B
    :param element_accumulator: data type used in accumulation
    :param element_output: data type of operands C and D
    :param opclass: "simt" or "tensor_op"
    :param threadblock_shape: threadblock tile dimensions
    :param warp_count: warps per threadblock dimension
    :param instruction_shape: MMA instruction dimensions
    :param stages: number of pipeline stages
    :param iterator_algorithm: optional iterator algorithm name
    :param swizzle: optional swizzling stride
    :param split_k_mode: "serial" or "parallel"
    :param split_k_slices: number of split-K slices
    :param activation: epilogue activation name
    :return: the generated test function (also set as an attribute of ``cls``)
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )
        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }
        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle
        if activation != "identity":
            # leaky_relu takes its negative slope as an extra argument.
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)
        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
        # Run every problem size this configuration supports; beta=2.0
        # exercises the source-accumulate path.
        for ps in problem_sizes:
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue
            self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))
    setattr(cls, test_name, run)
    return run
def get_conv_problems():
    """Return the default test problem set: the standard testbed (minimum
    channel size 64) plus three alignment-4/2 cases."""
    problems = TestbedConv2dProblemSizes(64).all
    # Insert alignment 4 & 2 tests: (N, H, W, C, K, R, S, C, pad_h, pad_w,
    # stride_h, stride_w, dilation_h, dilation_w)
    extra_specs = [
        (1, 4, 4, 12, 8, 3, 3, 12, 0, 0, 3, 3, 1, 1),
        (1, 4, 4, 14, 8, 3, 3, 14, 0, 0, 3, 3, 1, 1),
        (1, 23, 56, 98, 128, 3, 3, 98, 4, 5, 3, 3, 1, 1),
    ]
    for spec in extra_specs:
        problems.append(Conv2DProblemSize(
            *spec, ConvMode.CrossCorrelation, 1, 1))
    return problems

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover and run every conv2d_*.py unittest file that lives next to
    # this script, independent of the current working directory.
    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    tests = unittest.TestLoader().discover(script_dir, 'conv2d_*.py')
    results = unittest.runner.TextTestRunner().run(tests)
    if not results.wasSuccessful():
        # SystemExit gives a nonzero exit status without a Python traceback,
        # which is the conventional failure signal for a test-runner script.
        raise SystemExit('Test cases failed')

View File

@ -0,0 +1,308 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests emitting a CUTLASS kernel to a PyTorch CUDA extension
"""
import random
import tempfile
import unittest
import cutlass
if cutlass.utils.datatypes.torch_available:
import torch
def _initialize(dtype, M: int, N: int, K: int):
"""
Utility function to initialize A, B, C, and D matrices corresponding to dimensions M, N, and K
:param dtype: data type of tensors
:param M: M dimension of GEMM problem
:type M: int
:param N: N dimension of GEMM problem
:type N: int
:param K: N dimension of GEMM problem
:type K: int
:return: initialized tensors A, B, C, and D
:rtype: list
"""
sizes = [(M, K), (K, N), (M, N), (M, N)]
return [torch.randint(-3, 3, size, device='cuda').to(dtype) for size in sizes]
def _generate_problems(dtype, num):
    """
    Utility function to generate `num` GEMMs of random sizes

    :param dtype: data type of tensors
    :param num: number of GEMMs to generate
    :type num: int

    :return: lists of A, B, C, and D tensors
    :rtype: list
    """
    valid_sizes = [128, 256, 512, 1024]
    problems = []
    for _ in range(num):
        # Pick a random (M, N, K) for this group member and build its tensors
        m, n, k = (random.choice(valid_sizes) for _ in range(3))
        problems.append(_initialize(dtype, m, n, k))
    if not problems:
        return [], [], [], []
    # Transpose [(A, B, C, D), ...] into (As, Bs, Cs, Ds)
    As, Bs, Cs, Ds = (list(tensors) for tensors in zip(*problems))
    return As, Bs, Cs, Ds
def _generate_conv2d_problem(conv_kind, dtype, ps):
"""
Utility function to generate conv2d inputs
:param conv_kind: kind of convolution
:type conv_kind: str
:param dtype: data type of tensors
:param problem_size: the conv2d problem size
:type problem_size: cutlass.shape.Conv2DProblemSize
:return: initialized tensors A, B, C, and D
:rtype: list
"""
if conv_kind == "fprop":
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
elif conv_kind == "dgrad":
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
else:
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
class PyTorchExtensionTest(unittest.TestCase):
    """
    End-to-end tests that emit CUTLASS GEMM, grouped GEMM, and conv2d
    operations as JIT-compiled PyTorch CUDA extensions and verify the
    extension results against PyTorch reference implementations.
    """

    def test_gemm(self):
        """
        GEMM with a ReLU epilogue, checked against torch matmul under the
        default and explicit alpha/beta call forms.
        """
        random.seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        plan.activation = cutlass.epilogue.relu
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
            A, B, C, _ = _initialize(dtype, 1024, 256, 512)
            # All four call forms below are expected to equal relu(A @ B);
            # the asserts imply beta defaults to 0, so C does not contribute.
            D_ref = torch.nn.functional.relu(A @ B)
            D = mod.run(A, B)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C, 1.0)
            assert torch.allclose(D, D_ref)
            D = mod.run(A, B, C, 1.0, 0.0)
            assert torch.allclose(D, D_ref)
            # Nontrivial scaling: D = relu(alpha * (A @ B) + beta * C)
            alpha = 2.0
            beta = -1.0
            D_ref = torch.nn.functional.relu((A @ B) * alpha + (beta * C))
            D = mod.run(A, B, C, alpha, beta)
            assert torch.allclose(D, D_ref)

    def test_grouped_gemm(self):
        """
        Grouped GEMM over 50 random problem sizes, checked group-by-group
        against torch matmul.
        """
        random.seed(2023)
        dtype = torch.float16
        plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name='grouped_gemm_mod', cc=plan.cc, sourcedir=tmpdir, jit=True)
            As, Bs, Cs, _ = _generate_problems(dtype, 50)

            def check_all(X, Y):
                # Elementwise comparison of two lists of tensors
                for x, y in zip(X, Y):
                    assert torch.allclose(x, y)

            Ds_ref = [a @ b for a, b in zip(As, Bs)]
            Ds = mod.run(As, Bs)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs, 1.0)
            check_all(Ds, Ds_ref)
            Ds = mod.run(As, Bs, Cs, 1.0, 0.0)
            check_all(Ds, Ds_ref)
            # Nontrivial scaling per group
            alpha = 2.0
            beta = -1.0
            Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
            Ds = mod.run(As, Bs, Cs, alpha, beta)
            check_all(Ds, Ds_ref)

    def test_conv2d_fprop(self):
        """
        Conv2d forward propagation with a ReLU epilogue, including serial
        and parallel split-K variants checked against the non-split result.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
        plan.activation = "relu"
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # Reference: relu(alpha * conv2d(A, B) + beta * C)
            D_ref = alpha * torch.ops.aten.conv2d(
                A, B, stride=stride, padding=padding
            ) + beta * C
            D_ref = torch.nn.functional.relu(D_ref)
            D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)
            assert torch.allclose(D, D_ref)
            # Test serial split-K
            D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)
            # Test parallel split-K
            D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)

    def test_conv2d_dgrad(self):
        """
        Conv2d data gradient (dgrad), checked against
        torch.nn.grad.conv2d_input.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1,
                cutlass.ConvMode.CrossCorrelation,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # dgrad requires the activation-gradient output shape explicitly
            input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
            D_ref = alpha * torch.nn.grad.conv2d_input(
                input_size, B, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )
            assert torch.allclose(D, D_ref)

    def test_conv2d_wgrad(self):
        """
        Conv2d weight gradient (wgrad), checked against
        torch.nn.grad.conv2d_weight, including split-K variants.
        """
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)
        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
            problem_size = cutlass.shape.Conv2DProblemSize(
                1, 4, 4, 16,
                8, 3, 3, 16,
                0, 0,
                3, 3,
                1, 1,
                cutlass.ConvMode.CrossCorrelation,
                1, 1
            )
            A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)
            alpha = 1.0
            beta = 0.5
            # wgrad requires the filter-gradient output shape explicitly
            weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
            D_ref = alpha * torch.nn.grad.conv2d_weight(
                B, weight_size, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)
            assert torch.allclose(D, D_ref)
            # Test serial split-K
            D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)
            # Test parallel split-K
            D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,100 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for compute node in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass import swizzle
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTComputeSM90(EVTTestCaseBase):
    """
    Tests for EVT compute nodes: arithmetic operators and named function
    calls inside an epilogue-visitor-tree definition.
    """

    def test_arith(self):
        """
        Test arithmetic ops (+, *, -, /) in an EVT epilogue
        """
        # NOTE(review): EVTTestBed appears to consume this function's body as
        # the epilogue definition, so it is kept as a plain expression.
        def evt_arith_compute(accum, C, alpha, beta, gamma):
            D = ((accum + C) * alpha - gamma) / beta
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }
            launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_func_call(self):
        """
        Test function-call nodes (relu, multiply_add) in an EVT epilogue
        """
        def evt_func_call(accum, C, alpha, beta, gamma):
            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, (l, m, n))
            }
            launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
            input_keys = ["C", "alpha", "beta", "gamma"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,173 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for store nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLayoutSM90(EVTTestCaseBase):
    """
    Tests for EVT layout-manipulation nodes (permute, reshape) applied to
    epilogue intermediates and auxiliary inputs.
    """

    def test_permute_1(self):
        """
        Permute F and C to (l, n, m), add them, then permute the sum back so
        D is written with the original (l, m, n) shape.
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D_permute = F_permute + permute(C, indices=(0, 2, 1))
            D = permute(D_permute, indices=(0, 2, 1))
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_2(self):
        """
        Permute F to (l, n, m) and add C supplied in that shape; D is
        written with shape (l, n, m).
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D = F_permute + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, n, m)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, n, m)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_3(self):
        """
        Permute F to (m, l, n) — swapping the batch and row modes — and add
        C supplied in that shape; D is written with shape (m, l, n).
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(1, 0, 2))
            D = F_permute + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (m, l, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (m, l, n)),
            }
            launcher = EVTTestBed(self.element, evt_permute, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_reshape(self):
        """
        Reshape the (16, 32) auxiliary input TensorE to (512, 1) so it
        broadcasts column-wise against F.
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            E_reshape = reshape(TensorE, new_shape=(512, 1))
            D = F + E_reshape
            return D
        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (16, 32)),
            "D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
        }
        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)

    def test_reshape2(self):
        """
        Reshape F from (l, m, n) to rank-4 (2, 3, 512, 256) and broadcast-add
        the auxiliary input TensorE of shape (2, 3, 1, n).
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
            D = F_reshape + TensorE
            return D
        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
        }
        launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
        input_keys = ["alpha", "TensorE"]
        result_keys = ["D"]
        launcher.verify(self.problem_size, input_keys, result_keys, self.l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,142 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for load nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLoadSM90(EVTTestCaseBase):
    """
    Tests for EVT auxiliary-load nodes: full tensors and row/column/scalar
    broadcasts, each paired with a batched fp32 counterpart.
    """

    def test_tensor_load(self):
        """
        Load extra tensors with shape [m, n] (unbatched) and [l, m, n]
        (batched, fp32)
        """
        def evt_tensor_load(accum, C, aux, aux_batch):
            D = accum + C + aux + aux_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "aux": self.fake_tensor(self.element, (m, n)),
                "aux_batch": self.fake_tensor(np.float32, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
            input_keys = ["C", "aux", "aux_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_row_broadcast(self):
        """
        Load extra tensors with shape [1, n] (row broadcast) and [l, 1, n]
        (batched, fp32)
        """
        def evt_row_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "bias": self.fake_tensor(self.element, (n,)),
                "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
            input_keys = ["C", "bias", "bias_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_column_broadcast(self):
        """
        Load extra tensors with shape [m, 1] (column broadcast) and
        [l, m, 1] (batched, fp32)
        """
        def evt_column_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "bias": self.fake_tensor(self.element, (m, 1)),
                "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
            input_keys = ["C", "bias", "bias_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_scalar_broadcast(self):
        """
        Load a host scalar and a per-batch [l, 1, 1] fp32 scalar tensor
        """
        def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
            D = accum + C + alpha + alpha_batch
            return D
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "C": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
            input_keys = ["C", "alpha", "alpha_batch"]
            result_keys = ["D"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
if __name__ == '__main__':
    # Allow running this test file directly
    unittest.main()

View File

@ -0,0 +1,274 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unittest for mixed types of nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass.swizzle import ThreadblockSwizzleStreamK
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTMixedSM90(EVTTestCaseBase):
    """
    Tests for DAG-structured EVT epilogues that mix compute, auxiliary
    loads, broadcasts, and row/column max reductions, exercised under
    several scheduling modes (default, stage-2 epilogue, partition-K,
    stream-K, and unbatched).
    """

    def test_mixed_dag(self):
        """
        Mixed DAG epilogue at each supported alignment.
        """
        # `max(..., dim=...)` is the EVT reduction imported from
        # cutlass.epilogue, not Python's builtin max.
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        # NOTE(review): 'aligments' is a misspelling of 'alignments' (local only)
        if device_cc() == 80:
            aligments = [2, 4, 8]
        else:
            # Sm90 EVT currently only supports 128-bit alignment
            aligments = [8,]
        for align in aligments:
            for m, n, k, l in self.get_problem_sizes(align):
                example_inputs = {
                    "accum": self.fake_tensor(self.element, (l, m, n)),
                    "alpha": 1.0,
                    "C": self.fake_tensor(self.element, (l, m, n)),
                    "beta": 1.0,
                    "aux": self.fake_tensor(self.element, (l, m, n)),
                    "cbias": self.fake_tensor(self.element, (m, 1)),
                    "rbias": self.fake_tensor(self.element, (n,)),
                    "D": self.fake_tensor(self.element, (l, m, n)),
                    "F": self.fake_tensor(self.element, (l, m, n)),
                    "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                    "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                }
                launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_float(self):
        """
        Same mixed DAG with fp32 inputs/outputs, including odd alignment 3.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for align in [3, 2, 4]:
            for m, n, k, l in self.get_problem_sizes(align):
                example_inputs = {
                    "accum": self.fake_tensor(np.float32, (l, m, n)),
                    "alpha": 1.0,
                    "C": self.fake_tensor(np.float32, (l, m, n)),
                    "beta": 1.0,
                    "aux": self.fake_tensor(np.float32, (l, m, n)),
                    "cbias": self.fake_tensor(np.float32, (m, 1)),
                    "rbias": self.fake_tensor(np.float32, (n,)),
                    "D": self.fake_tensor(np.float32, (l, m, n)),
                    "F": self.fake_tensor(np.float32, (l, m, n)),
                    "F_row_max": self.fake_tensor(np.float32, (n,)),
                    "E_col_max": self.fake_tensor(np.float32, (m, 1))
                }
                launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stage2(self):
        """
        Same mixed DAG compiled with a 2-stage epilogue.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (l, m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_partition_k(self):
        """
        Same mixed DAG with a warp_count that partitions along K
        (warp_count[2] == 2) and a 2-stage epilogue.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (l, m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            tile_description = {
                "threadblock_shape": [128, 128, 64],
                "warp_count": [2, 2, 2]
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stream_k(self):
        """
        Same mixed DAG under the stream-K threadblock swizzle, with and
        without an explicit tile description, batched and unbatched.
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        # High per-sm occupancy tile_description
        tile_description = {
            "threadblock_shape": [128, 128, 32],
            "warp_count": [2, 2, 1],
            "stages": 3
        }
        tds = [None, tile_description]
        for td in tds:
            for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
                # Unbatched problems use rank-2 tensors; batched use rank-3
                if l == 1:
                    example_inputs = {
                        "accum": self.fake_tensor(self.element, (m, n)),
                        "alpha": 1.0,
                        "C": self.fake_tensor(self.element, (m, n)),
                        "beta": 1.0,
                        "aux": self.fake_tensor(self.element, (m, n)),
                        "cbias": self.fake_tensor(self.element, (m, 1)),
                        "rbias": self.fake_tensor(self.element, (n,)),
                        "D": self.fake_tensor(self.element, (m, n)),
                        "F": self.fake_tensor(self.element, (m, n)),
                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                    }
                else:
                    example_inputs = {
                        "accum": self.fake_tensor(self.element, (l, m, n)),
                        "alpha": 1.0,
                        "C": self.fake_tensor(self.element, (l, m, n)),
                        "beta": 1.0,
                        "aux": self.fake_tensor(self.element, (l, m, n)),
                        "cbias": self.fake_tensor(self.element, (m, 1)),
                        "rbias": self.fake_tensor(self.element, (n,)),
                        "D": self.fake_tensor(self.element, (l, m, n)),
                        "F": self.fake_tensor(self.element, (l, m, n)),
                        "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                        "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
                    }
                if td is not None:
                    launcher = EVTTestBed(
                        self.element, evt_mixed_dag, example_inputs,
                        tile_description=td,
                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
                else:
                    launcher = EVTTestBed(
                        self.element, evt_mixed_dag, example_inputs,
                        swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
                input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
                result_keys = ["D", "F", "F_row_max", "E_col_max"]
                launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_mixed_dag_no_batch(self):
        """
        Same mixed DAG with rank-2 (unbatched) tensors throughout.
        """
        def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max
        for m, n, k, _ in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (m, n)),
                "alpha": 1.0,
                "C": self.fake_tensor(self.element, (m, n)),
                "beta": 1.0,
                "aux": self.fake_tensor(self.element, (m, n)),
                "cbias": self.fake_tensor(self.element, (m, 1)),
                "rbias": self.fake_tensor(self.element, (n,)),
                "D": self.fake_tensor(self.element, (m, n)),
                "F": self.fake_tensor(self.element, (m, n)),
                "F_row_max": self.fake_tensor(DataType.f32, (n,)),
                "E_col_max": self.fake_tensor(DataType.f32, (m, 1))
            }
            launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
            input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
            result_keys = ["D", "F", "F_row_max", "E_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, 1)
# Allow this test module to be executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,155 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for store nodes in SM80 and SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTStoreSM90(EVTTestCaseBase):
    # NOTE(review): despite the SM90 suffix in the class name, the skip guard
    # above also admits SM80 devices.
    #
    # In every test below the variable names inside the traced epilogue
    # function are significant: they match the keys of example_inputs and are
    # the names used to bind runtime arguments and results.

    def test_aux_store(self):
        """
        Returning a tensor with shape [m, n]
        """
        # Stores the auxiliary intermediate F in addition to the output D.
        def evt_aux_store(accum, alpha, C):
            F = alpha * accum
            D = F + C
            return D, F
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F": self.fake_tensor(self.element, (l, m, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_col_reduce(self):
        """
        Reduction [m, n] -> [m, 1]
        """
        # dim=[2,] reduces over the n axis: one value per (batch, row), i.e.
        # shapes (l, m, 1) and (m, 1) below. NOTE(review): the inner names say
        # "row" while the test name says "col" -- presumably "row max" means
        # "max of each row"; confirm against the project's naming convention.
        def evt_row_reduce(accum, alpha, C):
            acc_row_max = max(accum, dim=[2,])
            F = alpha * accum
            F_row_max = max(F, dim=[0, 2])
            D = F + C
            return D, F_row_max, acc_row_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F_row_max": self.fake_tensor(np.float32, (m, 1)),
                "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_row_max", "acc_row_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_row_reduce(self):
        """
        Reduction [m, n] -> [n]
        """
        # dim=[1,] reduces over the m axis: shapes (l, 1, n) and (n,) below.
        def evt_col_reduce(accum, alpha, C):
            acc_col_max = max(accum, dim=[1,])
            F = alpha * accum
            F_col_max = max(F, dim=[0, 1])
            D = F + C
            return D, F_col_max, acc_col_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "F_col_max": self.fake_tensor(np.float32, (n,)),
                "acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_col_max", "acc_col_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)

    def test_scalar_reduce(self):
        """
        Reduction [m, n] -> [1,]
        """
        # acc_max keeps one value per batch (l, 1, 1); F_max additionally
        # reduces over the batch axis down to a single scalar (1,).
        def evt_scalar_reduce(accum, alpha, C):
            acc_max = max(accum, dim=[1, 2])
            F = alpha * accum
            F_max = max(F, dim=[0, 1, 2])
            D = F + C
            return D, F_max, acc_max
        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = {
                "accum": self.fake_tensor(self.element, (l, m, n)),
                "alpha": 2.0,
                "C": self.fake_tensor(self.element, (l, m, n)),
                "acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
                "F_max": self.fake_tensor(np.float32, (1,)),
                "D": self.fake_tensor(self.element, (l, m, n)),
            }
            launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
            input_keys = ["C", "alpha"]
            result_keys = ["D", "F_max", "acc_max"]
            launcher.verify((m, n, k), input_keys, result_keys, l)
# Allow this test module to be executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover and run every EVT unit test (evt_*.py) located next to this
    # script, regardless of the current working directory.
    loader = unittest.TestLoader()
    script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    tests = loader.discover(script_dir, 'evt_*.py')
    testRunner = unittest.runner.TextTestRunner()
    results = testRunner.run(tests)
    if not results.wasSuccessful():
        # SystemExit yields a clean nonzero exit status for CI instead of the
        # traceback a generic Exception would print.
        raise SystemExit('Test cases failed')

View File

@ -0,0 +1,230 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Testbed classes of EVT
"""
import torch
import unittest
import cutlass
from cutlass import Tensor
import cutlass.backend.evt
from cutlass.profiler import CUDAEventProfiler
from cutlass.shape import GemmCoord
from cutlass.utils.datatypes import torch_type
class EVTReferenceModule:
    """Host-side reference: a torch GEMM followed by the traced epilogue visitor."""

    def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
        """Record operand layouts and the visitor applied after the mainloop."""
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.epilogue_visitor = epilogue_visitor

    def _as_row_major(self, tensor, layout, rows, cols, batch):
        """View a flat tensor as (batch, rows, cols), transposing column-major data."""
        if layout == cutlass.LayoutType.RowMajor:
            return tensor.view((batch, rows, cols))
        # Column-major storage: view as transposed then permute back.
        return torch.permute(tensor.view((batch, cols, rows)), (0, 2, 1))

    def run(self, A, B, C, problem_size, alpha, beta, batch=1):
        """Compute alpha * A @ B + beta * C and return it flattened in layout_C order."""
        A_row = self._as_row_major(A, self.layout_A, problem_size.m, problem_size.k, batch)
        B_row = self._as_row_major(B, self.layout_B, problem_size.k, problem_size.n, batch)
        C_row = self._as_row_major(C, self.layout_C, problem_size.m, problem_size.n, batch)
        out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
        if self.layout_C == cutlass.LayoutType.ColumnMajor:
            out_row = torch.permute(out_row, (0, 2, 1))
        return torch.flatten(out_row)

    def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
        """Run the mainloop (alpha=1, beta=0), then the epilogue visitor.

        Always returns a tuple of reference results.
        """
        accum = self.run(
            A, B, C, problem_size, 1.0, 0.0, batch=batch
        ).reshape(batch, problem_size.m, problem_size.n)
        epilogue_args["accum"] = accum
        outputs = self.epilogue_visitor(**epilogue_args)
        return outputs if isinstance(outputs, tuple) else (outputs,)
class EVTTestBed:
    """
    Epilogue Visitor Testbed

    Compiles a GEMM whose epilogue is traced from a Python function and
    verifies device results against EVTReferenceModule running the same
    visitor on the host.
    """
    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
        # element: operand data type; evt_fn: function traced into the visitor;
        # example_inputs: name -> Tensor/scalar examples used for tracing.
        # Optional kwargs: tile_description, swizzling_functor, epilogue_stages.
        self.element = element
        layout = cutlass.LayoutType.RowMajor
        self.example_inputs = example_inputs
        # Create the Gemm plan
        self.plan = cutlass.op.Gemm(element=element, layout=layout, element_accumulator=torch.float32)
        if "tile_description" in kwargs:
            self.plan.tile_description = kwargs["tile_description"]
        if "swizzling_functor" in kwargs:
            self.plan.swizzling_functor = kwargs["swizzling_functor"]
        # Compile the epilogue visitor
        epilogue_visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
        if "epilogue_stages" in kwargs:
            epilogue_visitor.epilogue_stages = kwargs["epilogue_stages"]
        self.plan.epilogue_visitor = epilogue_visitor
        # Reference model (all operands row-major)
        self.reference_fn = EVTReferenceModule(layout, layout, layout, epilogue_visitor)
        self.profile = profile
    def get_torch_tensor(self, shape, dtype=None, fill=None):
        # Allocate a CUDA tensor. With fill=None, entries are
        # ceil(uniform(-4.5, 3.5)), i.e. small integer values so that the
        # torch.equal comparison in verify() can be exact.
        if dtype is None:
            dtype = self.element
        dtype = torch_type(dtype)
        if fill is None:
            return torch.ceil(
                torch.empty(size=shape, dtype=dtype, device="cuda").uniform_(-4.5, 3.5)
            )
        else:
            return torch.full(shape, fill, dtype=dtype, device="cuda")
    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
        """
        Verify the results
        """
        problem_size = GemmCoord(*problem_size)
        # Initiate the GEMM arguments
        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))
        # Initialize the epilogue args
        epilogue_args = {}
        for key in self.example_inputs.keys():
            if key in input_keys:
                # Inputs get random tensors; non-Tensor scalars pass through.
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element)
                else:
                    epilogue_args[key] = tensor
            elif key in result_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    # Pre-fill result tensors: max-reductions start far below
                    # any generated value so real results always overwrite.
                    if "max" in key:
                        fill = -1000
                    else:
                        fill = 0
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element, fill=fill)
                else:
                    epilogue_args[key] = tensor
        tensor_D = epilogue_args["D"]
        if "C" in epilogue_args:
            tensor_C = epilogue_args["C"]
        else:
            # No explicit C input: reuse D as the C operand. The host
            # reference multiplies C by beta=0 (see EVTReferenceModule.__call__).
            tensor_C = tensor_D
        # Run the device kernel
        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)
        # Run the host reference
        evt_args_inputs = {}
        for key in input_keys:
            evt_args_inputs[key] = epilogue_args[key]
        reference_results = self.reference_fn(
            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)
        # Compare the results. NOTE(review): assumes the reference returns its
        # results in the same order as result_keys.
        for result, ref in zip(result_keys, reference_results):
            assert torch.equal(epilogue_args[result].flatten(), ref.flatten())
        # Run profile
        if self.profile:
            profiler = CUDAEventProfiler(
                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
                visitor_args = epilogue_args
            )
            print(f"Cutlass Python Duration: {profiler()}")
class EVTTestCaseBase(unittest.TestCase):
    """
    Base class for EVT Unittest

    Provides default problem extents, placeholder-tensor creation for EVT
    tracing, and problem-size enumeration for subclasses.
    """
    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
        """
        :param methodName: standard unittest test-method name
        :param lmnk: default (batch, M, N, K) extents
        """
        super().__init__(methodName)
        self.element = cutlass.DataType.f16
        self.l, self.m, self.n, self.k = lmnk
        self.problem_size = (self.m, self.n, self.k)
        # Deterministic random inputs across runs
        torch.random.manual_seed(42)

    def fake_tensor(self, element, shape):
        """Create a placeholder row-major Tensor used only to trace the EVT."""
        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)

    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
        """
        Enumerate (m, n, k, batch) problem sizes exercising the given alignment.

        :param alignment: operand alignment; m/n extents are multiples of it
        :param k: optional K override; defaults to self.k
        :param batch_count: iterable of batch extents to sweep
        :return: list of (m, n, k, l) tuples

        The default for ``batch_count`` is a tuple to avoid the shared
        mutable-default-argument pitfall; ``k`` is compared against None
        explicitly rather than by truthiness.
        """
        if k is None:
            k = self.k
        problem_size_m = [alignment, 512 - 3 * alignment]
        problem_size_n = [alignment, 512 - alignment]
        if alignment % 8 == 0:
            # Larger extents only make sense for full 8-element alignment.
            problem_size_m.append(768)
            problem_size_n.append(768)
        return [
            (m, n, k, l)
            for m in problem_size_m
            for n in problem_size_n
            for l in batch_count
        ]

View File

@ -0,0 +1,134 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
High-level tests for running batched GEMMs
"""
from functools import partial
import logging
from math import prod
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
import torch
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
torch.manual_seed(2023)
def pytorch_reference(A, B, C, alpha, beta):
    """Compute alpha * (A @ B) + beta * C with broadcasting of unbatched operands.

    Any of A, B, C may carry leading batch dimensions; the first tensor found
    with more than two dimensions determines the batch shape (tensors with a
    batch dimension are assumed to have matching batch counts). The result has
    that batch shape followed by C's trailing (rows, cols).
    """
    # Batch shape from the first batched operand; default to a single batch.
    batch_dims = next((t.shape[:-2] for t in (A, B, C) if len(t.shape) > 2), (1,))
    flat_batch = prod(batch_dims)

    def as_flat_batched(tensor):
        # Broadcast unbatched operands; collapse multi-dim batches for bmm.
        if len(tensor.shape) == 2:
            return tensor.unsqueeze(0).repeat(flat_batch, 1, 1)
        return tensor.reshape(-1, tensor.size(-2), tensor.size(-1))

    A3 = as_flat_batched(A)
    B3 = as_flat_batched(B)
    C3 = as_flat_batched(C)
    out = (torch.bmm(A3, B3) * alpha) + (C3 * beta)
    return out.reshape(*(batch_dims + C3.shape[-2:]))
def initialize(rows, cols, batch):
    """Create a half-precision CUDA tensor of random integers in [-3, 3).

    Returns shape (*batch, rows, cols) when batch has more than one element
    in total, otherwise a plain (rows, cols) matrix.
    """
    total = rows * cols * prod(batch)
    flat = torch.randint(-3, 3, size=(total,), device='cuda').half()
    if len(batch) > 0 and prod(batch) > 1:
        return flat.reshape(*(batch + (rows, cols)))
    return flat.reshape(rows, cols)
class GemmF16Batched(unittest.TestCase):
    """Batched F16 GEMM tests sweeping which operands carry a batch dimension."""

    def run_batched(self, batch_count: tuple, batch_A: bool, batch_B: bool, batch_C: bool):
        """Run one batched GEMM and check it against the torch reference.

        :param batch_count: batch shape applied to the selected operands and D
        :param batch_A/batch_B/batch_C: whether each operand is batched
        """
        M, N, K = 512, 256, 128
        alpha, beta = 1., 2.
        unbatched = (1,)

        A = initialize(M, K, batch_count if batch_A else unbatched)
        B = initialize(K, N, batch_count if batch_B else unbatched)
        C = initialize(M, N, batch_count if batch_C else unbatched)
        D = initialize(M, N, batch_count)

        plan = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=cutlass.DataType.f32)
        plan.run(A, B, C, D, alpha, beta)

        expected = pytorch_reference(A, B, C, alpha, beta)
        assert expected.equal(D)

    def test_batched_ABC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, True, True)

    def test_batched_AB(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, True, False)

    def test_batched_AC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, False, True)

    def test_batched_BC(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, False, True, True)

    def test_batched_A(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, True, False, False)

    def test_batched_B(self):
        for shape in ((3,), (2, 3)):
            self.run_batched(shape, False, True, False)

View File

@ -0,0 +1,125 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F16 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
# Target compute capability for this module's tests.
cc = 80

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80(unittest.TestCase):
    """
    Empty wrapper class; GEMM test cases are attached to it dynamically in __main__.
    """
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class; stream-K GEMM test cases are attached to it dynamically in __main__.
    """
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Sweep every layout combination at the default 128x128x32 tile, 2x2x1 warps, 3 stages.
for _layouts in (LayoutCombination.NNN, LayoutCombination.NNT, LayoutCombination.NTN,
                 LayoutCombination.NTT, LayoutCombination.TNN, LayoutCombination.TNT,
                 LayoutCombination.TTN, LayoutCombination.TTT):
    add_test_tensorop(cls=GemmF16Sm80, layouts=_layouts, alignments=[8, 8, 8],
                      element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                      threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)

# TNT variations of tile shape, alignment, accumulator type, and stage count.
for _alignments, _accumulator, _tb_shape, _warps, _stages in (
        ([8, 8, 8], cutlass.DataType.f32, [ 64, 128, 32], [1, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [128,  64, 32], [2, 1, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [ 64,  64, 64], [1, 1, 1], 3),
        ([4, 4, 8], cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3),
        ([4, 4, 8], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3),
        ([8, 8, 8], cutlass.DataType.f32, [ 64,  64, 64], [1, 1, 1], 5),
        ([2, 2, 2], cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)):
    add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=_alignments,
                      element_output=cutlass.DataType.f16, element_accumulator=_accumulator,
                      threadblock_shape=_tb_shape, warp_count=_warps, stages=_stages)

# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
for _layouts, _accumulator, _tb_shape, _warps in (
        (LayoutCombination.NNN, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1]),
        (LayoutCombination.TNN, cutlass.DataType.f32, [ 64, 128, 8], [1, 2, 1]),
        (LayoutCombination.NTN, cutlass.DataType.f32, [128,  64, 8], [2, 1, 1]),
        (LayoutCombination.TTN, cutlass.DataType.f32, [ 64,  64, 8], [1, 1, 1]),
        (LayoutCombination.NNT, cutlass.DataType.f16, [128, 128, 8], [2, 2, 1])):
    add_test_simt(cls=GemmF16Sm80, layouts=_layouts, alignments=[1, 1, 1],
                  element_output=cutlass.DataType.f16, element_accumulator=_accumulator,
                  threadblock_shape=_tb_shape, warp_count=_warps, stages=2)

# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp,
                           swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8],
                 element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                 threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8],
                 element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                 threadblock_shape=[ 64,  64, 64], warp_count=[1, 1, 1], stages=5)
# Run the dynamically-registered tests when executed directly.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,140 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F16 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
# Target compute capability for this module's tests.
cc = 90

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF16Sm90(unittest.TestCase):
    """
    Empty wrapper class; GEMM test cases are attached to it dynamically in __main__.
    """
# Common settings for every test in this file: attach to GemmF16Sm90, use F16
# operands, leave warp count unset, and compile with nvcc.
add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
                               warp_count=None, compilation_modes=['nvcc'])
# Specialization for tests using Tensor Core MMA instructions
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
# NOTE(review): stages=None appears to let the generator choose the stage
# count automatically -- confirm against add_test_gemm in utils.
add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
# Tests with different cluster shapes
add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                       element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
                       element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])
# Tests for different schedule modes
# Each (kernel_schedule, epilogue_schedule) pair is exercised with 1x1x1 and 2x1x1 clusters.
add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
                            element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
                            opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[1, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
    cluster_shape=[2, 1, 1],
    kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
    epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8])
add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8])
add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,100 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F32 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F32 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K F32 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: F32 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,99 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F64 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F64 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K F64 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: F64 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                  element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
              element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
                 element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,69 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with F64 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM90)
cc = 90
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF64Sm90(unittest.TestCase):
    """
    Empty wrapper class to which test methods for F64 GEMMs on SM90 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
# Every F64 SM90 test shares unit alignment, a 1x1x1 cluster, f64 operands,
# f64 output and accumulation, and nvcc compilation.
add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
                               element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
                               element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])
# (opclass, layouts, threadblock shape, stage count) for each registered test
_f64_sm90_cases = [
    (cutlass.OpcodeClass.TensorOp, LayoutCombination.NNT, [128, 128, 32], 3),
    (cutlass.OpcodeClass.TensorOp, LayoutCombination.TNN, [128, 128, 32], 3),
    (cutlass.OpcodeClass.Simt,     LayoutCombination.NNN, [128, 128, 8],  2),
    (cutlass.OpcodeClass.Simt,     LayoutCombination.TTT, [ 64, 128, 8],  2),
]
for _opclass, _layouts, _tb_shape, _stages in _f64_sm90_cases:
    add_test_specialized(opclass=_opclass, layouts=_layouts, threadblock_shape=_tb_shape, stages=_stages)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,99 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with S8 operands on SM80
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM80)
cc = 80
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80(unittest.TestCase):
    """
    Empty wrapper class to which test methods for S8 GEMMs on SM80 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80StreamK(unittest.TestCase):
    """
    Empty wrapper class to which stream-K S8 SM80 test methods are attached
    dynamically at module load time by the add_test_streamk calls below.
    """
    pass
# Common settings for every test in this file: S8 operands, SM80 target, 1x1x1 cluster
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass.DataType.s32,
                  element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
              element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                 element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,95 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for GEMM with S8 operands on SM90
"""
from functools import partial
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
# Suppress CUTLASS info/debug output while the tests below are registered and run
cutlass.set_log_level(logging.WARNING)
# Minimum device compute capability required by the tests in this file (SM90)
cc = 90
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmS8Sm90(unittest.TestCase):
    """
    Empty wrapper class to which test methods for S8 GEMMs on SM90 are
    attached dynamically at module load time by the add_test_* calls below.
    """
    pass
# Common settings for every test in this file: attach to GemmS8Sm90, S8 operands, nvcc compilation
add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])
# Specialization for tests using Tensor Core MMA instructions
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
# NOTE(review): stages=None appears to let the generator choose the stage
# count automatically -- confirm against add_test_gemm in utils.
add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with different cluster shapes
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with warp-specialized ping-pong schedule
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
                  element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
                  kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
                  epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)
# Tests for SIMT
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
              element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,387 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from math import prod
import os
import re
import subprocess
import torch
from cutlass import (
DataType,
DataTypeSize,
GemmUniversalMode,
LayoutType,
OpcodeClass,
ShortDataTypeNames,
SwizzlingFunctor
)
from cutlass.backend import compiler
from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
from cutlass.backend.memory_manager import get_allocated_size
from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
from cutlass.shape import GemmCoord, MatrixCoord
from cutlass.utils.datatypes import torch_type
class GemmUniversalLauncher:
    """
    Compiles a GEMM operation (plus a split-K reduction kernel for pre-SM90
    architectures), launches it on device, and optionally verifies the result
    against a PyTorch reference GEMM.
    """

    def __init__(
        self,
        operation,
        seed=2080,
        verification=True,
        iterations=500,
        compiler_mode="nvcc",
        **kwargs,
    ) -> None:
        """
        :param operation: GEMM operation to compile and launch
        :param seed: seed for torch's random number generator
        :param verification: whether to verify results against a reference
        :param iterations: unused here; accepted for interface compatibility
        :param compiler_mode: compiler backend to use ("nvcc" or "nvrtc")
        """
        # Create the reduction kernel, if needed
        self.reduction_operation: ReductionOperation = ReductionOperation(
            shape=MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C,
            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.epilogue_functor.element_epilogue,
            epilogue_functor=operation.epilogue_functor,
            count=operation.C.alignment,
        )
        self.math_operation = operation.tile_description.math_instruction.math_operation
        self.verification = verification
        # Select the compiler backend used to JIT the kernels.
        if compiler_mode == "nvcc":
            compiler.nvcc()
        elif compiler_mode == "nvrtc":
            compiler.nvrtc()
        else:
            raise Exception(f"Unexpected compiler string {compiler_mode}")
        op_list = [operation]
        if operation.arch < 90:
            # Split K via Python is currently only supported for pre-SM90 kernels
            op_list.append(self.reduction_operation)
        compiler.add_module(op_list, bypass_cache=False)
        self.operation = operation
        # Torch dtypes corresponding to each operand's CUTLASS data type.
        # D uses C's element type.
        self.dtype_A = torch_type(operation.A.element)
        self.dtype_B = torch_type(operation.B.element)
        self.dtype_C = torch_type(operation.C.element)
        self.dtype_D = torch_type(operation.C.element)
        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
        element_size = DataTypeSize[operation.A.element]
        # Narrow the random-initialization range for small element types,
        # presumably so accumulation stays exact for the comparison against
        # the reference — TODO confirm the range rationale.
        if element_size == 1:
            self.rand_max = 1
            self.rand_min = 0
        elif element_size <= 8:
            self.rand_max = 1
            self.rand_min = -1
        elif element_size == 16:
            self.rand_max = 4
            self.rand_min = -4
        else:
            self.rand_max = 8
            self.rand_min = -8
        self.seed = seed
        self.compute_type = operation.epilogue_functor.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    def print_problem_size(self, p, mode, batch_count):
        """
        Prints the problem size, batch count, and mode (used on verification failure).

        :param p: problem size with .m/.n/.k members (e.g. GemmCoord)
        :param mode: GemmUniversalMode of the failing run
        :param batch_count: batch count of the failing run
        """
        if mode == GemmUniversalMode.Gemm:
            mode = "Gemm"
        elif mode == GemmUniversalMode.Batched:
            mode = "GemmBatched"
        elif mode == GemmUniversalMode.GemmSplitKParallel:
            mode = "GemmSplitKParallel"
        print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")

    def uniform_init(self, shape, dtype, layout):
        """
        Initializes a tensor of ``shape`` with integer-valued random data in
        [self.rand_min, self.rand_max].

        :param shape: shape of the tensor to create
        :param dtype: torch dtype of the tensor
        :param layout: CUTLASS layout (row- or column-major) of the operand

        :return: tuple of (tensor laid out for the CUTLASS kernel — transposed
            for column-major operands — and the row-major reference tensor)
        """
        size = prod(shape)
        if dtype.is_floating_point:
            # Ceil of a uniform draw in [rand_min - 0.5, rand_max - 0.5) yields
            # integer-valued floats in [rand_min, rand_max].
            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
        else:
            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
            # Fall back to CPU for integer type references.
            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)
        if dtype == torch.float64 or dtype == torch.float32:
            # NOTE(review): f32/f64 data is moved to CPU, presumably so the
            # reference computation for these types runs on CPU — confirm.
            data = data.to("cpu")
        data_ref = data.reshape(shape)
        if layout == LayoutType.RowMajor:
            data_cutlass = data_ref
        else:
            # Column-major operands are stored as the transpose of the
            # row-major reference data.
            data_cutlass = data_ref.transpose(-1, -2).contiguous()
        data_cutlass = data_cutlass.to("cuda")
        return data_cutlass, data_ref

    def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        """
        Computes the reference result D = alpha * (A @ B) + beta * C via torch.

        :return: reference tensor D, cast to the operation's D dtype
        """
        # If any tensor is on CPU, place all tensors on CPU unless only
        # tensor C is on CPU
        devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
        if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
            device = torch.device("cpu")
        else:
            device = tensor_A.device
        tensor_A = tensor_A.to(device)
        tensor_B = tensor_B.to(device)
        tensor_C = tensor_C.to(device)
        # Perform the alpha/beta scaling in the epilogue's compute type.
        dtype = torch_type(self.compute_type)
        alpha_torch = torch.tensor([alpha], device=device).to(dtype)
        beta_torch = torch.tensor([beta], device=device).to(dtype)
        tmp = tensor_A @ tensor_B
        tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
        return tensor_D_ref.to(self.dtype_D)

    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        """
        Runs the operation once for the given configuration and (optionally)
        verifies the result against the torch reference.

        :param mode: GemmUniversalMode to run in
        :param problem_size: GemmCoord problem size
        :param batch_count: number of batches (or split-K slices; see below)
        :param split_k_slices: number of split-K slices for split-K modes
        :param alpha: epilogue scalar alpha
        :param beta: epilogue scalar beta

        :return: whether the computed result matched the reference
        :rtype: bool
        """
        torch.random.manual_seed(self.seed)
        # Assign an actual batch count in cases where we are not running in batched mode.
        # This is to differentiate between the number of split K slices and the batch count,
        # which are overloaded within the single `batch_count` variable.
        if mode == GemmUniversalMode.Batched:
            true_batch_count = batch_count
        else:
            true_batch_count = 1

        def transpose(layout):
            # Flips between row- and column-major layouts.
            if layout == LayoutType.RowMajor:
                return LayoutType.ColumnMajor
            else:
                return LayoutType.RowMajor

        # If the operation's A/B operands were swapped during construction
        # (`switched`), initialize each operand with the other's transposed layout.
        tensor_A, tensor_A_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.k),
            self.dtype_A,
            self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
        )
        tensor_B, tensor_B_ref = self.uniform_init(
            (true_batch_count, problem_size.k, problem_size.n),
            self.dtype_B,
            self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
        )
        tensor_C, tensor_C_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.n),
            self.dtype_C,
            self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
        )
        tensor_D = torch.zeros_like(tensor_C)
        # Integer epilogues require integral alpha/beta.
        if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
            alpha = int(alpha)
            beta = int(beta)
        #
        # Launch kernel
        #
        arguments = GemmArguments(
            operation=self.operation,
            problem_size=problem_size,
            A=tensor_A,
            B=tensor_B,
            C=tensor_C,
            D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            gemm_mode=mode,
            split_k_slices=split_k_slices,
            batch=batch_count,
        )
        if mode == GemmUniversalMode.GemmSplitKParallel:
            # Parallel split-K writes partial accumulators to `arguments.ptr_D`;
            # a separate reduction kernel combines them into `tensor_D`.
            reduction_arguments = ReductionArguments(
                self.reduction_operation,
                problem_size=[problem_size.m, problem_size.n],
                partitions=split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta),
            )
        self.operation.run(arguments)
        if mode == GemmUniversalMode.GemmSplitKParallel:
            self.reduction_operation.run(reduction_arguments)
        passed = True
        if self.verification:
            # Synchronize on whichever kernel produced the final result.
            if mode == GemmUniversalMode.GemmSplitKParallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.reference(
                problem_size,
                tensor_A_ref,
                tensor_B_ref,
                tensor_C_ref,
                alpha,
                beta,
            )
            tensor_D_ref = tensor_D_ref.to('cuda')
            # The reference is row-major; transpose the device result back
            # when the operation produced a column-major (or switched) D.
            if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
                tensor_D = tensor_D.transpose(-1, -2).contiguous()
            passed = tensor_D.equal(tensor_D_ref)
            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, mode, batch_count)
        del arguments
        if mode == GemmUniversalMode.GemmSplitKParallel:
            del reduction_arguments
        # Ensure all device memory allocated for this run was released.
        cur_size = get_allocated_size()
        assert cur_size == 0, f"{cur_size} B of memory were not released after this run"
        return passed
def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
    """
    Sweeps GEMM modes, problem sizes, batch counts, and epilogue scalars
    appropriate for ``testcase`` and runs ``operation`` on each combination,
    verifying results against a reference.

    :param operation: GEMM operation to test
    :param testcase: test flavor; "multistage" uses a small fixed sweep,
        anything else (typically "universal") uses the full sweep.
        "interleaved" is unsupported.
    :param compilation_mode: compiler backend to use ("nvcc" or "nvrtc")

    :return: whether all runs passed
    :rtype: bool
    """
    passed = True
    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
    )
    opcode_class = operation.tile_description.math_instruction.opcode_class
    # SIMT kernels have no vectorized-access requirement; other opcode classes
    # require 128-bit-aligned accesses on the smallest operand type.
    if opcode_class == OpcodeClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size
    alignment_m = alignment
    alignment_n = alignment
    alignment_k = alignment
    # INT8 alignment constraints
    if opcode_class == OpcodeClass.Simt:
        A_is_s8 = operation.A.element == DataType.s8
        B_is_s8 = operation.B.element == DataType.s8
        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
            alignment_m = 4
        # Bug fix: this previously read `B_is_s8 == DataType.s8`, comparing a
        # bool against an enum (always False, so alignment_n was never
        # constrained), and tested operand A's layout instead of B's.
        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
            alignment_n = 4
        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
            alignment_k = 4
    threadblock_k = operation.tile_description.threadblock_shape[2]
    assert testcase != "interleaved"
    # Split K via Python is only supported pre-SM90 and not with stream-K.
    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK
    if testcase == "multistage":
        modes = [GemmUniversalMode.Gemm]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [
            threadblock_k,
            threadblock_k * operation.tile_description.stages
            + operation.tile_description.math_instruction.instruction_shape[2],
        ]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1]
    else:
        modes = [GemmUniversalMode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if supports_split_k:
            modes.append(GemmUniversalMode.GemmSplitKParallel)
        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        # When stage count is resolved automatically (None), assume a nominal
        # stage count for sizing the K extents of the sweep.
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k,
        ]
        problem_alpha = [1.0]
        problem_beta = [2.0]
    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)
    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if k // batch_count < 2 * threadblock_k:
                                        continue
                                problem_size = GemmCoord(m, n, k)
                                if supports_split_k:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1
                                # `batch_count` doubles as the split-K slice
                                # count; plain GEMM with batch_count > 1 runs
                                # in batched mode instead.
                                overridden_mode = mode
                                if mode == GemmUniversalMode.Gemm and batch_count > 1:
                                    overridden_mode = GemmUniversalMode.Batched
                                passed = testbed.run(
                                    overridden_mode,
                                    problem_size,
                                    batch_count,
                                    split_k_slices,
                                    alpha,
                                    beta,
                                )
                                if not passed:
                                    return False
    return passed

View File

@ -0,0 +1,44 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
    # Discover every gemm_*.py test module next to this script and run it,
    # failing loudly if any test case does not pass.
    search_root = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(search_root, 'gemm_*.py')
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception('Test cases failed')

View File

@ -0,0 +1,239 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass
from cutlass import (
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames
)
from cutlass.backend import library
from cutlass.backend.utils.software import SubstituteTemplate
from gemm_testbed import test_all_gemm
class Layout:
    """
    Utility class to map transpose and non-transpose terminology to row- and column-major terminology
    """
    # 'T' (transposed) corresponds to row-major storage.
    T = LayoutType.RowMajor
    # 'N' (non-transposed) corresponds to column-major storage.
    N = LayoutType.ColumnMajor
class LayoutCombination:
    """
    Utility class defining all combinations of row- and column-major layouts for operands to a GEMMs

    Each attribute is a (layout_A, layout_B, layout_C) tuple named by the
    BLAS-style transpose letters of the three operands.
    """
    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Builds the procedural test-case name for a GEMM configuration.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue (unused in the name)
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param kernel_schedule: kernel_schedule type
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: procedural test name
    :rtype: str
    """
    template = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"
    # Architecture, operand types/layouts, and operation class.
    fields = {
        "arch": str(arch),
        "eA": DataTypeNames[element_a],
        "eB": DataTypeNames[element_b],
        "eC": DataTypeNames[element_output],
        "opclass": OpcodeClassNames[opclass],
        "acc": DataTypeNames[element_accumulator],
    }
    fields["lA"], fields["lB"], fields["lC"] = (ShortLayoutTypeNames[lay] for lay in layouts)
    # Threadblock tile, cluster shape, and per-operand alignments.
    for key, dim in zip(("tbM", "tbN", "tbK"), threadblock_shape):
        fields[key] = str(dim)
    for key, dim in zip(("cM", "cN", "cK"), cluster_shape):
        fields[key] = str(dim)
    for key, align in zip(("aA", "aB", "aC"), alignments):
        fields[key] = str(align)
    # Stage count is "auto" when it will be resolved automatically.
    fields["stages"] = str(stages) if stages is not None else "auto"
    # Optional schedule suffixes and caller-provided suffix (empty when unset).
    fields["k"] = KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else ""
    fields["e"] = EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else ""
    fields["suffix"] = suffix if suffix is not None else ""
    return SubstituteTemplate(template, fields)
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=('nvcc', 'nvrtc')):  # tuple: avoid a mutable default argument
    """
    Create test-running functions with the given specification and set it as a method of ``cls``.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param compilation_modes: list of compilers to used in testing the kernel (options: 'nvrtc', 'nvcc')
    :type compilation_modes: list
    """
    for compilation_mode in compilation_modes:
        # Bug fix: bind ``compilation_mode`` as a default argument. ``run`` is
        # defined inside the loop, and a plain closure late-binds the loop
        # variable — previously every generated test (despite its name) ran
        # with the final mode in ``compilation_modes``.
        def run(self, compilation_mode=compilation_mode):
            """
            Dynamically-generated function that constructs a GEMM operation and verifies it against
            multiple test cases.
            """
            element_A = element
            element_B = element
            layout_A, layout_B, layout_C = layouts
            alignment_A, alignment_B, alignment_C = alignments
            plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                                   element_C=element_output, element_D=element_output,
                                   layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                   element_accumulator=element_accumulator,
                                   kernel_cc=cc)
            plan.opclass = opclass
            if swizzle is not None:
                plan.swizzling_functor = swizzle
            # Start from the first available tile description and override the
            # fields specified by the test.
            td = plan.tile_descriptions()[0]
            if warp_count is not None:
                td.warp_count = warp_count
            td.threadblock_shape = threadblock_shape
            td.stages = stages
            td.cluster_shape = cluster_shape
            op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))

        element_epilogue = element_accumulator
        # Attach the generated test under a procedural name that encodes the
        # full configuration (including the compilation mode suffix).
        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')
        setattr(cls, name, run)

View File

@ -0,0 +1,284 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests the high-level Conv2d interface
"""
from math import ceil
import unittest
import cutlass
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
import os
class Conv2dEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Conv2d interface

    A reference plan/operation is built in ``__init__`` from fully-specified
    parameters; each test method builds alternative plans (partially-specified
    parameters, numpy/torch tensor frontends) and asserts they emit the same kernel.
    """

    def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
                 alignment_A, alignment_B, alignment_C):
        """
        :param conv_kind: convolution kind ("fprop", "wgrad", or "dgrad")
        :param element_A: data type of operand A
        :param element_B: data type of operand B
        :param element_C: data type of operand C
        :param element_D: data type of operand D
        :param element_accumulator: data type used in accumulation
        :param alignment_A: alignment of operand A
        :param alignment_B: alignment of operand B
        :param alignment_C: alignment of operand C
        """
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
        self.conv_kind = conv_kind
        # Reference plan/operation against which all other constructions are compared.
        self.plan = cutlass.op.Conv2d(
            kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
            element_D=element_D, element_accumulator=element_accumulator)
        self.op = self.plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default Conv2d
        :type other_plan: cutlass.op.Conv2d

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(
            alignment_A=self.alignment_A, alignment_B=self.alignment_B,
            alignment_C=self.alignment_C)
        # Plans are considered equal if they emit identical kernel source.
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
        and layouts for constructing the Conv2d interface
        """
        # NOTE(review): this guard mirrors numpy_test's, but no numpy is used
        # below — confirm whether it is intentional.
        if not datatypes.numpy_available:
            return
        # Test when specifying all parameters
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator)
        assert self._plans_equal(plan_other)
        # Test when specifying all parameters but A
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_B=self.element_B, element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
        # Test when specifying all parameters but A and B as tensors using generic element and output
        plan_other = cutlass.op.Conv2d(
            kind=self.conv_kind,
            element_C=self.element_C,
            element_D=self.element_D, element_accumulator=self.element_accumulator,
            element=self.element_A)
        assert self._plans_equal(plan_other)
        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Conv2d(
                kind=self.conv_kind,
                element_C=self.element_C,
                element_D=self.element_D,
                element=self.element_A)
            assert self._plans_equal(plan_other)
        # Test with only the generic types. Only run if the types of A, B, C, and D are the same
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
                and self.element_A == self.element_accumulator):
            plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
        """
        if not datatypes.numpy_available:
            return
        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)
        # Tensor contents are irrelevant here — only dtypes are inspected.
        size = (2, 2)
        A = np.zeros(size, dtype=type_A)
        B = np.zeros(size, dtype=type_B)
        C = np.zeros(size, dtype=type_C)
        D = np.zeros(size, dtype=type_D)
        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def torch_test(self):
        """
        Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
        """
        if not datatypes.torch_available:
            return
        import torch
        type_A = datatypes.torch_type(self.element_A)
        type_B = datatypes.torch_type(self.element_B)
        type_C = datatypes.torch_type(self.element_C)
        type_D = datatypes.torch_type(self.element_D)
        type_accum = datatypes.torch_type(self.element_accumulator)
        # Tensor contents are irrelevant here — only dtypes are inspected.
        size = (2, 2)
        A = torch.empty(size, dtype=type_A)
        B = torch.empty(size, dtype=type_B)
        C = torch.empty(size, dtype=type_C)
        D = torch.empty(size, dtype=type_D)
        return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)

    def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
        """
        Shared body for the numpy/torch frontend tests: constructs plans from
        the given tensors/types and asserts equivalence with the reference plan.
        """
        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)
        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
        assert self._plans_equal(plan_np)
        # Test when specifying all parameters but A and B as tensors and using generic element and output
        if type_A == type_B:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
            assert self._plans_equal(plan_np)
        # Test without explicit accumulator. Only run if the type of C and the accumulator are equal.
        if type_C == type_accum:
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)
        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
            plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Conv2d interface
        """
        self.generic_test()
        self.numpy_test()
        self.torch_test()
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Conv2d interface

    Test methods are generated dynamically and attached via ``add_test``.
    """
    # NOTE(review): the skip condition uses `<= 80`, which also skips SM80
    # devices even though the message refers to SM80 tests — confirm intent.
    pass
# Memory alignment (in elements) used for each operand data type.
# Presumably the maximal 16-byte vectorized access: 8 x f16 = 4 x f32 = 16B
# — TODO confirm against the kernels' alignment requirements.
type2alignment = {
    cutlass.DataType.f16: 8,
    cutlass.DataType.f32: 4
}
def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
    """
    Generates a Conv2d-equivalence test for the given configuration and
    attaches it to ``ConvEquivalenceTest`` under a procedural name.
    """
    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"

    def run(self):
        # Build the equivalence checker and exercise every construction path.
        checker = Conv2dEquivalence(
            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A],
            alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C],
        )
        checker.test_all()

    setattr(ConvEquivalenceTest, test_name, run)
# Register one equivalence test per conv kind crossed with each supported
# (A, B, C, D, accumulator) data-type combination.
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    for types in [
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
        [cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32]
    ]:
        add_test(conv_kind, *types)
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Conv2d interface
    """
    # NOTE(review): the skip condition uses `<= 80`, which also skips SM80
    # devices even though the message refers to SM80 tests — confirm intent.

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
        # Alignment 3 is expected to be rejected for F16 operands.
        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
        td = plan.tile_descriptions()[0]
        # Deliberately malformed threadblock shape; compilation should fail.
        td.threadblock_shape = [17, 32, 5]
        plan.tile_description = td
        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()
        # Clean up the error message
        os.remove("./cutlass_python_compilation_device_error.txt")
# Run all unit tests defined in this module when invoked as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,245 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Test the EVT interface
"""
import numpy as np
import unittest
import cutlass
from cutlass import LayoutType, Tensor
from cutlass.backend.utils.device import device_cc
from cutlass.epilogue import reshape, permute
from utils import ExpectException
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class EVTErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the EVT interface

    Each test traces an intentionally malformed epilogue-visitor-tree (EVT)
    function through ``cutlass.epilogue.trace`` and asserts — via
    ``ExpectException`` with ``verify_msg=True`` — that tracing fails with
    the exact expected error message.
    """

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
    def test_root_not_d(self):
        """
        Test when "D" does not exist in Sm90 EVT
        """
        # The epilogue returns a tensor named 'F' instead of the required
        # root output 'D'.
        def evt_root_not_d(accum, alpha):
            F = accum * alpha
            return F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        # Only Sm90 enforces the 'D' root requirement, hence the cc guard on
        # whether an exception is expected.
        with ExpectException(device_cc() == 90,
            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
            "but the variable 'D' is not found in the return values.", True):
            cutlass.epilogue.trace(evt_root_not_d, example_tensors)

    def test_no_accum(self):
        """
        Test when "accum" is not in input arguments
        """
        # Every EVT must take the accumulator ('accum') as an input; this
        # epilogue deliberately omits it.
        def evt_no_accum(alpha, C):
            D = alpha * C
            return D
        example_tensors = {
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
            cutlass.epilogue.trace(evt_no_accum, example_tensors)

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
    def test_too_much_shared_memory(self):
        """
        Test when the epilogue consumes too much shared memory
        """
        # Chain five auxiliary inputs and five auxiliary outputs so that no
        # tile description can fit the epilogue's buffers in shared memory.
        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
            D = D4 + C5
            return D, D1, D2, D3, D4
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C1": self.fake_tensor(np.float16, (6, 512, 512)),
            "C2": self.fake_tensor(np.float16, (6, 512, 512)),
            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        # Tracing itself succeeds; the failure occurs when the visitor is
        # attached to a plan and no valid tile description can be found.
        epilogue_visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, example_tensors)
        plan = cutlass.op.Gemm(
            element=np.float16, layout=cutlass.LayoutType.RowMajor,
            element_accumulator=np.float32
        )
        with ExpectException(True,
            "RuntimeError: The epilogue consumes too much shared memory. "
            "No valid tile description is found in the generator.", True):
            plan.epilogue_visitor = epilogue_visitor

    def test_not_ssa(self):
        """
        Test when the epilogue is not in SSA
        """
        # Case 1: 'F' is assigned twice, violating single static assignment.
        def evt_redefine(accum, C, alpha):
            F = accum + C
            F = F * alpha
            D = F
            return D, F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
            cutlass.epilogue.trace(evt_redefine, example_tensors)

        # Case 2: 'C' is deliberately used without being defined or passed
        # as an argument (this is the condition under test, not a bug here).
        def evt_undefine(accum, alpha):
            F = accum + C
            D = F * alpha
            return D, F
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
            cutlass.epilogue.trace(evt_undefine, example_tensors)

    def test_missing_example_tensor(self):
        """
        Test when the example tensor of an input/output variable is not provided
        """
        def evt_missing_example_tensor(accum, C):
            D = accum + C
            return D
        # Missing example for output 'D'.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)
        # Missing example for input 'C'.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

    def test_return_expression(self):
        """
        Test when the return value is an expression
        """
        # Return values must be named variables, not expressions.
        def evt_return_expr(accum, C):
            return accum + C
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }
        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
            cutlass.epilogue.trace(evt_return_expr, example_tensors)

    def test_incompatible_shape(self):
        """
        Test when the shape of example tensors are incompatible
        """
        def evt_incompatible_shape(accum, C):
            D = accum + C
            return D
        # accum has 256 rows while C has 512 — not broadcastable.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }
        with ExpectException(True,
            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
            cutlass.epilogue.trace(evt_incompatible_shape, example_tensors)

    def test_no_matching_impl(self):
        # The permute+reshape on 'bias' yields a layout with strides that no
        # EVT load node implementation supports.
        def evt_no_matching_impl(accum, bias):
            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
            return D
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
            "bias": self.fake_tensor(np.float16, (16, 32)),
            "D": self.fake_tensor(np.float16, (6, 512, 256))
        }
        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
            cutlass.epilogue.trace(evt_no_matching_impl, example_tensors)

    #
    # Helper functions
    #

    def fake_tensor(self, element, shape):
        # Builds a placeholder row-major Tensor used only for dtype/shape
        # inference during tracing; it carries no real device data.
        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
# Allow running this test module directly (e.g., `python <file>`).
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,351 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests the high-level GEMM interface
"""
from math import ceil
import unittest
import cutlass
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
class GemmEquivalence:
    """
    Helper class for testing the equivalence of different constructions of the Gemm interface

    The fully-specified plan built in ``__init__`` is the reference; each
    ``*_test`` method builds the same plan through a different combination of
    explicit/generic/tensor-inferred parameters and asserts the emitted C++
    is identical.
    """
    def __init__(self, element_A, element_B, element_C, element_D, element_accumulator,
                 layout_A, layout_B, layout_C, alignment_A, alignment_B, alignment_C):
        self.element_A = element_A
        self.element_B = element_B
        self.element_C = element_C
        self.element_D = element_D
        self.element_accumulator = element_accumulator
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.alignment_A = alignment_A
        self.alignment_B = alignment_B
        self.alignment_C = alignment_C
        # Reference plan/operation against which all variants are compared
        self.plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B, element_C=element_C,
                                    element_D=element_D, element_accumulator=element_accumulator,
                                    layout_A=layout_A, layout_B=layout_B, layout_C=layout_C)
        self.op = self.plan.construct(alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)

    def _plans_equal(self, other_plan) -> bool:
        """
        Compares whether two plans are equal

        :param other_plan: plan to compare against the default GEMM
        :type other_plan: cutlass.op.Gemm

        :return: whether `other_plan` is equivalent to `self.plan`
        :rtype: bool
        """
        other_op = other_plan.construct(alignment_A=self.alignment_A, alignment_B=self.alignment_B, alignment_C=self.alignment_C)

        # Compare whether the operations are equal by comparing the C++ code that would be emitted for them
        return self.op.rt_module.emit() == other_op.rt_module.emit()

    def generic_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using CUTLASS data types
        and layouts for constructing the Gemm interface
        """
        # NOTE(review): this guard mirrors numpy_test, but generic_test does
        # not appear to use numpy itself — possibly over-conservative; confirm.
        if not datatypes.numpy_available:
            return

        # Test when specifying all parameters
        plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_A=self.layout_A, layout_B=self.layout_B, layout_C=self.layout_C)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A
        plan_other = cutlass.op.Gemm(element_B=self.element_B, element_C=self.element_C,
                                     element_D=self.element_D, element_accumulator=self.element_accumulator,
                                     layout_B=self.layout_B, layout_C=self.layout_C,
                                     element=self.element_A, layout=self.layout_A)
        assert self._plans_equal(plan_other)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if self.element_A == self.element_B and self.layout_A == self.layout_B:
            plan_other = cutlass.op.Gemm(element_C=self.element_C, element_D=self.element_D, element_accumulator=self.element_accumulator,
                                         layout_C=self.layout_C, element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

        # Test without explicit accumulator. Only run if the types of C and the accumulator match.
        if self.element_C == self.element_accumulator:
            plan_other = cutlass.op.Gemm(element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
                                         element_D=self.element_D, layout_A=self.layout_A, layout_B=self.layout_B,
                                         layout_C=self.layout_C)
            assert self._plans_equal(plan_other)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
            and self.element_A == self.element_accumulator and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_other = cutlass.op.Gemm(element=self.element_A, layout=self.layout_A)
            assert self._plans_equal(plan_other)

    def numpy_test(self):
        """
        Tests the equivalence of various constructions of the Gemm interface when using numpy as a frontend
        """
        # Skip entirely if numpy is not installed in this environment
        if not datatypes.numpy_available:
            return

        import numpy as np
        type_A = datatypes.numpy_type(self.element_A)
        type_B = datatypes.numpy_type(self.element_B)
        type_C = datatypes.numpy_type(self.element_C)
        type_D = datatypes.numpy_type(self.element_D)
        type_accum = datatypes.numpy_type(self.element_accumulator)

        # Map CUTLASS layouts to the numpy memory-order flags used by np.zeros
        layout_to_order = {
            cutlass.LayoutType.RowMajor: 'C',
            cutlass.LayoutType.ColumnMajor: 'F'
        }
        size = (2, 2)
        A = np.zeros(size, order=layout_to_order[self.layout_A], dtype=type_A)
        B = np.zeros(size, order=layout_to_order[self.layout_B], dtype=type_B)
        C = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_C)
        D = np.zeros(size, order=layout_to_order[self.layout_C], dtype=type_D)

        # Test when specifying all parameters via tensors
        plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D, element_accumulator=type_accum)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A as tensors
        plan_np = cutlass.op.Gemm(B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A, layout_A=self.layout_A)
        assert self._plans_equal(plan_np)

        # Test when specifying all parameters but A and B as tensors and using generic element and output
        # Only run this test if the layouts and types for A and B are equal.
        if type_A == type_B and self.layout_A == self.layout_B:
            plan_np = cutlass.op.Gemm(C=C, D=D, element_accumulator=type_accum, element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

        # Test without explicit accumulator. Only run if the types of C and the accumulator match.
        if type_C == type_accum:
            plan_np = cutlass.op.Gemm(A=A, B=B, C=C, D=D)
            assert self._plans_equal(plan_np)

        # Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
        if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum and
            self.layout_A == self.layout_B and self.layout_A == self.layout_C):
            plan_np = cutlass.op.Gemm(element=type_A, layout=self.layout_A)
            assert self._plans_equal(plan_np)

    def test_all(self):
        """
        Runs all tests on the Gemm interface
        """
        self.generic_test()
        self.numpy_test()
class GemmEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Gemm interface

    Each test instantiates a GemmEquivalence helper for a particular
    (element, layout, alignment) combination — encoded in the test name as
    types, a ttt/ntn-style layout triple, and the A/B/C alignments — and
    runs all of its equivalence checks.
    """

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_8_8_8(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f32_ntn_8_8_8(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
            layout_A=cutlass.LayoutType.ColumnMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.ColumnMajor,
            alignment_A=8, alignment_B=8, alignment_C=8)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for FP16 Tensor Core tests.")
    def test_gemm_equivalence_f16_f16_f16_f16_f16_ttt_4_4_4(self):
        # Fix: this test previously passed alignment 8/8/8, making it a
        # duplicate of the _8_8_8 case above; use alignment 4 to match the
        # test name and actually exercise a non-default alignment.
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f16, element_B=cutlass.DataType.f16, element_C=cutlass.DataType.f16,
            element_D=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.RowMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=4, alignment_B=4, alignment_C=4)
        gemm_eq.test_all()

    @unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for F64 Tensor Core tests.")
    def test_gemm_equivalence_f64_f64_f64_f64_f64_tnt_1_1_1(self):
        gemm_eq = GemmEquivalence(
            element_A=cutlass.DataType.f64, element_B=cutlass.DataType.f64, element_C=cutlass.DataType.f64,
            element_D=cutlass.DataType.f64, element_accumulator=cutlass.DataType.f64,
            layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor, layout_C=cutlass.LayoutType.RowMajor,
            alignment_A=1, alignment_B=1, alignment_C=1)
        gemm_eq.test_all()
class GemmErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
    """

    def test_alignment(self):
        """
        Tests case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)

        # F16 supports a maximum alignment of 8 elements; 16 must be rejected
        with ExpectException(True, 'Alignment 16 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=16, alignment_B=16, alignment_C=16)

    def test_tensorop_availability(self):
        """
        Tests case in which only SIMT operations are available but TensorOp is requested
        """
        cc = device_cc()

        # F64 Tensor Core operations are only available on devices with CC >= 80
        supports_tensorop_f64 = cc >= 80
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f64, layout=cutlass.LayoutType.RowMajor)

        error_msg = f'Incorrectly raised an exception for availability of TensorOp with F64 operands on SM{cc}'
        with ExpectException(not supports_tensorop_f64, error_msg):
            plan.opclass = cutlass.OpcodeClass.TensorOp

        # If the assignment failed, the plan must have kept its SIMT opclass
        expected_opclass = cutlass.OpcodeClass.TensorOp if supports_tensorop_f64 else cutlass.OpcodeClass.Simt
        assert plan.opclass == expected_opclass, f'Expected opclass to be {expected_opclass}, but received {plan.opclass} for SM{cc}'

    @unittest.skipIf(device_cc() < 70, "Device compute capability is insufficient for F16 Tensor Core tests.")
    def test_opclass_switch(self):
        """
        Tests cases in which the opcode class in question is switched (e.g., from TensorOp to SIMT)
        """
        plan = cutlass.op.Gemm(element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        assert plan.opclass == cutlass.OpcodeClass.TensorOp

        # Ensure that all tile descriptions have opclass of TensorOp
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp

        plan.opclass = cutlass.OpcodeClass.Simt

        # Ensure that all tile descriptions have opclass of Simt
        for td in plan.tile_descriptions():
            assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        cc = device_cc()
        plan = cutlass.op.Gemm(cc=cc, element=cutlass.DataType.f16, layout=cutlass.LayoutType.RowMajor)
        td = plan.tile_descriptions()[0]
        stages = td.stages

        # Zero stage count is valid for SM90+, as this is used to indicate that the builder's auto stage
        # count should be used
        with ExpectException(cc < 90, f'Requested zero stages'):
            td.stages = 0
            plan.construct(td)

        if cc < 90:
            with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
                td.stages = 3
                plan.construct(td)
        else:
            original_kschedule = td.kernel_schedule
            original_eschedule = td.epilogue_schedule
            # A pingpong kernel with a no-smem epilogue should leave room for
            # three mainloop stages on SM90
            with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
                td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
                td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
                td.stages = 3
                plan.construct(td)

            # Reset schedules
            td.kernel_schedule = original_kschedule
            td.epilogue_schedule = original_eschedule

        with ExpectException(True, f'Requested too many stages'):
            td.stages = 100
            plan.construct(td)

        # Reset stage count
        td.stages = stages

        cluster_shape = td.cluster_shape
        with ExpectException(cc < 90, f'Requested non-unit cluster shape on SM{cc}'):
            td.cluster_shape = [2, 1, 1]
            plan.construct(td)

        # Reset cluster shape
        td.cluster_shape = cluster_shape

        with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        # Kernel and epilogue schedules must be either both auto or both explicit
        with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
            plan.construct(td)

        with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
            td.tile_scheduler = cutlass.TileSchedulerType.StreamK
            plan.construct(td)

        # Ensure that all returned tile descriptions are unique
        ops = {}
        for td in plan.tile_descriptions():
            op = plan.construct(td)
            code_str = op.rt_module.emit()
            if code_str in ops:
                conflicting_td = ops[code_str]
                assert False, f'Multiple tile descriptions emitted {code_str}\nTile descriptions are:\n{td}\n{conflicting_td}'
            # Bug fix: the original never inserted into `ops`, so duplicate
            # emissions could never be detected. Record each emitted kernel.
            ops[code_str] = td
# Allow running this test module directly (e.g., `python <file>`).
if __name__ == '__main__':
    unittest.main()

View File

@ -0,0 +1,69 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Helper functions & classes for interface test
"""
class ExpectException:
    """
    Utility class to assert that an exception was raised when expected

    Example:

    .. highlight:: python
    .. code-block:: python

        with ExpectException(True, 'Division by zero'):
            x = 1.0 / 0.0

    :param exception_expected: whether an exception is expected to be raised
    :type exception_expected: bool
    :param message: message to print if an exception is raised when not expected or vice versa;
                    when ``verify_msg`` is True, this must be the exact expected
                    exception text in the form ``"<ExcType>: <args>"``
    :type message: str
    :param verify_msg: whether to also check that the raised exception's type and
                       message exactly match ``message``
    :type verify_msg: bool
    """
    def __init__(self, exception_expected: bool, message: str = '', verify_msg: bool = False):
        self.exception_expected = exception_expected
        self.message = message
        self.verify_msg = verify_msg

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message

        # Bug fix: only inspect the exception when one was actually raised;
        # previously exc_type.__name__ crashed with AttributeError when
        # verify_msg=True and no exception occurred.
        if self.verify_msg and exception_raised:
            exc_message = f"{exc_type.__name__}: {exc_val}"
            assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"

        # Suppress the exception so the test can continue past the `with` block
        return True