CUTLASS 3.2 (#1024)

* CUTLASS 3.2
This commit is contained in:
ANIKET SHIVAM
2023-08-07 14:50:32 -10:00
committed by GitHub
parent a0d787b746
commit 4575443d44
392 changed files with 47559 additions and 7940 deletions

View File

@ -0,0 +1,138 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Low-level functionality tests for Conv2d operands on SM80
"""
from conv2d_test_utils import *
import cutlass
import logging
cutlass.set_log_level(logging.WARNING)
cc = 80
@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
"""
Wrapper class to which tests will be added dynamically in __main__
"""
pass
conv_problems = get_conv_problems()
# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
# F16, simt
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="simt", threadblock_shape=[128, 128, 8],
warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
# F16, tensor op
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
# F16, tensor op, analytic iterator
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
# F16, tensor op, f32 output
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
# F16, tensor op, different tile description
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
# F32, simt
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="simt", threadblock_shape=[128, 128, 8],
warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
# Tf32, tensorop
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="tensor_op", threadblock_shape=[128, 128, 16],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
)
# Split-K
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
split_k_slices=2)
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
split_k_slices=5)
# Swizzling functor
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)
# Tests for few channels and fixed channels
# F16, tensor op, few channels
for c, tb, stage, inst in zip([2, 1],
[[128, 128, 64], [128, 128, 32]],
[3, 2],
[[16, 8, 16], [16, 8, 8]]):
add_test(
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=tb,
warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
)
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
add_test(
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
)
# Test activations
for activation in ["relu", "leaky_relu"]:
for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
add_test(
Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
split_k_slices=split_k_slices, activation=activation)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,508 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Util Functions for Conv2d Test
"""
import torch
import cutlass
import unittest
import cutlass_bindings
from cutlass.utils.datatypes import binding_type, binding_opclass
from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
from cutlass.backend.utils.device import device_cc
from cutlass.backend.test.utils import get_name_conv2d
import numpy as np
def conv2d_few_channel_problemsizes(channels):
problem_sizes = [
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 8, 8, channels),
cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(2, 2),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(2, 2),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
cutlass_bindings.Tensor4DCoord(16, 7, 7, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
cutlass_bindings.Tensor4DCoord(32, 7, 7, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
cutlass_bindings.Tensor4DCoord(64, 7, 7, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(2, 2),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
cutlass_bindings.MatrixCoord(2, 2),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
]
return problem_sizes
torch_dtype = {
cutlass.DataType.f16: torch.float16,
cutlass.DataType.f32: torch.float32,
cutlass.DataType.f64: torch.float64
}
numpy_dtype = {
cutlass.DataType.f16: np.float16,
cutlass.DataType.f32: np.float32,
cutlass.DataType.f64: np.float64
}
def validate_problem_size(ps, conv_kind, split_k_slices):
P = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
Q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
if P != ps.P or Q != ps.Q:
return False
# Split-K (serial or parallel) is not supported for strided dgrad
if conv_kind == "dgrad" and split_k_slices > 1 and (ps.stride_h > 1 or ps.stride_w > 1):
return False
return True
# Override the backend launcher
class Conv2dLauncherFrontend(Conv2dLauncher):
def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
self.operation = plan
self.conv_kind = plan.conv_kind
self.seed = seed
self.backend = backend
self.dtype_A = plan._element_a
self.dtype_B = plan._element_b
self.dtype_C = plan._element_c
self.dtype_acc = plan._element_accumulator
self.layout_A = cutlass_bindings.TensorNHWC
self.layout_B = cutlass_bindings.TensorNHWC
self.layout_C = cutlass_bindings.TensorNHWC
self.layout_D = cutlass_bindings.TensorNHWC
self.element_compute = cutlass_bindings.float32
self.enable_cached_results = True
# Get randomization_max
if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
self.randomization_max = 2
else:
self.randomization_max = 3
else:
self.randomization_max = 7
self.activation = plan.activation
self.host_conv2d = cutlass_bindings.test.conv.host.conv2d
def set_seed(self):
if self.backend == "numpy":
np.random.seed(self.seed)
else:
torch.manual_seed(self.seed)
def uniform_init(self, size, dtype):
if self.backend == "numpy":
return super().uniform_init(size, numpy_dtype[dtype])
else:
tensor = torch.ceil(
torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
).to(memory_format=torch.channels_last)
return tensor
def zeros_like(self, tensor):
if self.backend == "numpy":
return np.zeros_like(tensor)
else:
return torch.zeros_like(tensor).to(memory_format=torch.channels_last)
def reference(self, ps, A, B, C, alpha, beta, activation):
if self.backend == "numpy":
numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
return numpy_result
else:
if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
torch_result = alpha * torch.ops.aten.conv2d(
A,
B,
stride=(ps.stride_h, ps.stride_w),
padding=(ps.pad_h, ps.pad_w),
dilation=(ps.dilation_h, ps.dilation_w)
) + beta * C
elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
torch_result = alpha * torch.nn.grad.conv2d_input(
(ps.N, ps.C, ps.H, ps.W),
B,
A,
padding=(ps.pad_h, ps.pad_w),
stride=(ps.stride_h, ps.stride_w)
) + beta * C
elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
torch_result = alpha * torch.nn.grad.conv2d_weight(
B,
(ps.K, ps.C, ps.R, ps.S),
A,
padding=(ps.pad_h, ps.pad_w),
stride=(ps.stride_h, ps.stride_w)
) + beta * C
else:
raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
if activation == cutlass.backend.epilogue.relu:
torch_result = torch.nn.functional.relu(torch_result)
elif activation == cutlass.backend.epilogue.leaky_relu:
torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
return torch_result
def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
if self.element_compute == cutlass_bindings.float16:
alpha = cutlass_bindings.float16(alpha)
beta = cutlass_bindings.float16(beta)
elif self.element_compute == cutlass_bindings.int32:
alpha = int(alpha)
beta = int(beta)
else:
alpha = alpha
beta = beta
# If cached result is loaded
cached_result_loaded = False
if self.enable_cached_results:
# Get problem key
cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
self.conv_kind,
problem_size,
alpha,
beta,
getTensorView(
tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
),
getTensorView(
tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
),
getTensorView(
tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
),
)
cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"
cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()
conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
self.operation.arch,
self.seed,
)
cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
conv2d_result_cache_name
)
# CachedTestResultListing cached_results(conv2d_result_cache_name);
cached = cached_results.find(cached_test_key)
cached_result_loaded = cached[0]
if cached_result_loaded:
cached_test_result = cached[1]
if not cached_result_loaded:
# Compute the conv2d on host
tensor_D_ref = np.ones_like(tensor_C)
tensor_ref_A = getTensorRef(
tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
)
tensor_ref_B = getTensorRef(
tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
)
tensor_ref_C = getTensorRef(
tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
)
tensor_ref_D_ref = getTensorRef(
tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
)
self.host_conv2d(
self.conv_kind,
problem_size,
tensor_ref_A,
tensor_ref_B,
tensor_ref_C,
tensor_ref_D_ref,
alpha,
beta,
)
if activation == cutlass.backend.epilogue.leaky_relu:
tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
else:
tensor_D_ref = activation.numpy(tensor_D_ref)
tensor_view_D_ref = getTensorView(
tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
)
if self.enable_cached_results:
cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
tensor_view_D_ref
)
cached_results = (
cutlass_bindings.test.conv.host.CachedTestResultListing(
conv2d_result_cache_name
)
)
cached_results.append(cached_test_key, cached_test_result)
cached_results.write(conv2d_result_cache_name)
else:
return tensor_D_ref
return cached_test_result.D
def equal(self, tensor_D, tensor_D_ref, problem_size):
if self.backend == "numpy":
return super().equal(tensor_D, tensor_D_ref, problem_size)
else:
torch.cuda.synchronize()
return torch.equal(tensor_D, tensor_D_ref)
def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
#
# Initialize input and output tensors
#
if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
if self.backend == "torch":
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
else:
tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
if self.backend == "torch":
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
else:
tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
if self.backend == "torch":
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
else:
tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
else:
raise Exception(f"Conv kind {self.conv_kind} is not supported")
self.set_seed()
tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
tensor_D = self.zeros_like(tensor_C)
self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
stride=(ps.stride_h, ps.stride_w),
padding=(ps.pad_h, ps.pad_w),
dilation=(ps.dilation_h, ps.dilation_w),
alpha=alpha, beta=beta,
split_k=(split_k_mode, split_k_slices))
tensor_D_ref = self.reference(
ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
)
return self.equal(tensor_D, tensor_D_ref, ps)
def add_test(
cls,
cc,
conv_kind,
problem_sizes,
element,
element_accumulator,
element_output,
opclass,
threadblock_shape,
warp_count,
instruction_shape,
stages,
iterator_algorithm=None,
swizzle=None,
split_k_mode="serial",
split_k_slices=1,
activation = "identity"
):
"""Create a test-running function with the given specification"""
test_name = get_name_conv2d(
cc, conv_kind, element, element_accumulator,
element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)
def run(self):
# Create the plan
plan = cutlass.Conv2d(
kind=conv_kind,
element=element,
element_accumulator=element_accumulator,
element_C=element_output,
element_D=element_output
)
# Set the opclass
plan.opclass = opclass
# Set the tile description
td = {
"threadblock_shape": threadblock_shape,
"warp_count": warp_count,
"stages": stages,
"instruction_shape": instruction_shape,
}
plan.tile_description = td
# Set iterator algorithm
if iterator_algorithm is not None:
plan.iterator_algorithm = iterator_algorithm
# Set swizzling functor
if swizzle is not None:
plan.swizzling_stride = swizzle
if activation != "identity":
if activation == "leaky_relu":
plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
else:
plan.activation = getattr(cutlass.epilogue, activation)
conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")
for ps in problem_sizes:
if not validate_problem_size(ps, conv_kind, split_k_slices): continue
self.assertTrue(
conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
)
setattr(cls, test_name, run)
return run
def get_conv_problems():
# 64: minimum channel size
conv_problems = list(cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes)
# Insert alignment 4 & 2 tests
conv_problems += [
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 14),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 14),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 23, 56, 98),
cutlass_bindings.Tensor4DCoord(128, 3, 3, 98),
cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
),
]
return conv_problems

View File

@ -0,0 +1,42 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import unittest
if __name__ == '__main__':
loader = unittest.TestLoader()
tests = loader.discover('./', 'conv2d_*.py')
testRunner = unittest.runner.TextTestRunner()
results = testRunner.run(tests)
if not results.wasSuccessful():
raise Exception('Test cases failed')

View File

@ -39,6 +39,7 @@ import tempfile
import unittest
import cutlass
import cutlass_bindings
if cutlass.utils.datatypes.torch_available:
import torch
@ -85,6 +86,34 @@ def _generate_problems(dtype, num):
Ds.append(D)
return As, Bs, Cs, Ds
def _generate_conv2d_problem(conv_kind, dtype, ps):
"""
Utility function to generate conv2d inputs
:param conv_kind: kind of convolution
:type conv_kind: str
:param dtype: data type of tensors
:param problem_size: the conv2d problem size
:type problem_size: cutlass_bindings.conv.Conv2dProblemSize
:return: initialized tensors A, B, C, and D
:rtype: list
"""
if conv_kind == "fprop":
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
elif conv_kind == "dgrad":
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
else:
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]
@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
class PyTorchExtensionTest(unittest.TestCase):
@ -155,6 +184,127 @@ class PyTorchExtensionTest(unittest.TestCase):
Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
Ds = mod.run(As, Bs, Cs, alpha, beta)
check_all(Ds, Ds_ref)
def test_conv2d_fprop(self):
torch.manual_seed(2023)
dtype = torch.float16
plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
plan.activation = "relu"
op = plan.construct()
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
)
A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
stride = (problem_size.stride_h, problem_size.stride_w)
padding = (problem_size.pad_h, problem_size.pad_w)
alpha = 1.0
beta = 0.5
D_ref = alpha * torch.ops.aten.conv2d(
A, B, stride=stride, padding=padding
) + beta * C
D_ref = torch.nn.functional.relu(D_ref)
D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)
assert torch.allclose(D, D_ref)
# Test serial split-K
D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
assert torch.allclose(D, D_serial_split_k)
# Test parallel split-K
D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
assert torch.allclose(D, D_parallel_split_k)
def test_conv2d_dgrad(self):
torch.manual_seed(2023)
dtype = torch.float16
plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)
op = plan.construct()
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
)
A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
stride = (problem_size.stride_h, problem_size.stride_w)
padding = (problem_size.pad_h, problem_size.pad_w)
alpha = 1.0
beta = 0.5
input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
D_ref = alpha * torch.nn.grad.conv2d_input(
input_size, B, A,
stride=stride, padding=padding
) + beta * C
D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta, )
assert torch.allclose(D, D_ref)
def test_conv2d_wgrad(self):
torch.manual_seed(2023)
dtype = torch.float16
plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)
op = plan.construct()
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
1, 1
)
A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
stride = (problem_size.stride_h, problem_size.stride_w)
padding = (problem_size.pad_h, problem_size.pad_w)
alpha = 1.0
beta = 0.5
weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
D_ref = alpha * torch.nn.grad.conv2d_weight(
B, weight_size, A,
stride=stride, padding=padding
) + beta * C
D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)
assert torch.allclose(D, D_ref)
# Test serial split-K
D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
assert torch.allclose(D, D_serial_split_k)
# Test parallel split-K
D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
assert torch.allclose(D, D_parallel_split_k)
if __name__ == '__main__':

View File

@ -37,82 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM80
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 80
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.f16)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
threadblock_shape, warp_count, stages, opclass, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param swizzle: threadblock swizzling functor
"""
cluster_shape = [1, 1, 1]
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.f16
element_B = cutlass.DataType.f16
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator,
kernel_cc=cc)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.warp_count = warp_count
td.cluster_shape = cluster_shape
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80(unittest.TestCase):
"""
@ -128,40 +62,64 @@ class GemmF16Sm80StreamK(unittest.TestCase):
"""
pass
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 32], [2, 1, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 32], warp_count=[2, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmF16Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
if __name__ == '__main__':
unittest.main()

View File

@ -37,87 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM90
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 90
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.f16)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
cluster_shape, threadblock_shape, stages, opclass,
kernel_schedule=cutlass.KernelScheduleType.ScheduleAuto,
swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param cluster_shape: dimensions of threadblock cluster
:type cluster_shape: list or tuple
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param kernel_schedule: kernel schedule type
:type kernel_schedule: cutlass.KernelScheduleType
:param swizzle: threadblock swizzling functor
"""
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.f16
element_B = cutlass.DataType.f16
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.cluster_shape = cluster_shape
td.kernel_schedule = kernel_schedule
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
opclass=binding_opclass(opclass), kernel_schedule=kernel_schedule)
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF16Sm90(unittest.TestCase):
"""
@ -126,47 +55,85 @@ class GemmF16Sm90(unittest.TestCase):
pass
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
warp_count=None, compilation_modes=['nvcc'])
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 64], 5)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
# Tests with different cluster shapes
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [1, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 1, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 2, 1], [64, 128, 64], None)
add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])
# Tests for different schedule modes
add_test_schedule = partial(add_test, GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, opclass=cutlass.OpcodeClass.TensorOp)
add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
add_test_schedule([2, 1, 1], [256, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
add_test_schedule(
cluster_shape=[1, 1, 1],
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
cluster_shape=[1, 1, 1],
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
add_test_schedule(
cluster_shape=[2, 1, 1],
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
)
add_test_schedule(
cluster_shape=[2, 1, 1],
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
)
# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 8], 2)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8])
add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8])
add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])
if __name__ == '__main__':

View File

@ -37,83 +37,16 @@ Low-level functionality tests for GEMM with F32 operands on SM80
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 80
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.f32)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
threadblock_shape, warp_count, stages, opclass, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param swizzle: threadblock swizzling functor
"""
cluster_shape = [1, 1, 1]
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.f32
element_B = cutlass.DataType.f32
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator,
kernel_cc=cc)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.warp_count = warp_count
td.cluster_shape = cluster_shape
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF32Sm80(unittest.TestCase):
"""
@ -130,25 +63,37 @@ class GemmF32Sm80StreamK(unittest.TestCase):
pass
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])
# Tests using TensorOp
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(GemmF32Sm80, LayoutCombination.NNN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF32Sm80, LayoutCombination.NNT, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 32], [1, 1, 1], 4)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmF32Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(GemmF32Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
add_test_simt(GemmF32Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
add_test_simt(GemmF32Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
add_test_simt(GemmF32Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(GemmF32Sm80StreamK, LayoutCombination.TTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
if __name__ == '__main__':

View File

@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM80
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 80
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.f64)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
threadblock_shape, warp_count, stages, opclass, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param swizzle: threadblock swizzling functor
"""
cluster_shape = [1, 1, 1]
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.f64
element_B = cutlass.DataType.f64
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator,
kernel_cc=cc)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.warp_count = warp_count
td.cluster_shape = cluster_shape
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF64Sm80(unittest.TestCase):
"""
@ -131,25 +63,36 @@ class GemmF64Sm80StreamK(unittest.TestCase):
pass
# Tests using TensorOp
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])
add_test_tensorop(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
add_test_tensorop(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 16], [2, 2, 1], 4)
add_test_tensorop(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [32, 32, 16], [2, 1, 1], 5)
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)
# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(GemmF64Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 128, 8], [1, 2, 1], 2)
add_test_simt(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 64, 8], [2, 1, 1], 2)
add_test_simt(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 8], [1, 1, 1], 2)
add_test_simt(GemmF64Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(GemmF64Sm80StreamK, LayoutCombination.NTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
if __name__ == '__main__':

View File

@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM90
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 90
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.f64)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param cluster_shape: dimensions of threadblock cluster
:type cluster_shape: list or tuple
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param persistent: whether this is a persistent warp-specialized kernel
:type persistent: bool
:param swizzle: threadblock swizzling functor
"""
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.f64
element_B = cutlass.DataType.f64
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.cluster_shape = cluster_shape
td.persistent = persistent
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
if persistent:
suffix = "_persistent"
else:
suffix = ""
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
opclass=binding_opclass(opclass), suffix=suffix)
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF64Sm90(unittest.TestCase):
"""
@ -129,13 +55,14 @@ class GemmF64Sm90(unittest.TestCase):
pass
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])
add_test_tensorop(GemmF64Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmF64Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
add_test_simt(GemmF64Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 8], 2)
add_test_simt(GemmF64Sm90, LayoutCombination.TTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [64, 128, 8], 2)
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.NNT, threadblock_shape=[128, 128, 32], stages=3)
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.TNN, threadblock_shape=[128, 128, 32], stages=3)
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.NNN, threadblock_shape=[128, 128, 8], stages=2)
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.TTT, threadblock_shape=[ 64, 128, 8], stages=2)
if __name__ == '__main__':

View File

@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM80
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 80
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.s8)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
threadblock_shape, warp_count, stages, opclass, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param swizzle: threadblock swizzling functor
"""
cluster_shape = [1, 1, 1]
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.s8
element_B = cutlass.DataType.s8
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator,
kernel_cc=cc)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.warp_count = warp_count
td.cluster_shape = cluster_shape
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmS8Sm80(unittest.TestCase):
"""
@ -131,25 +63,36 @@ class GemmS8Sm80StreamK(unittest.TestCase):
pass
# Tests using TensorOp
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [256, 128, 64], [4, 2, 1], 3)
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 4], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 64], [1, 1, 1], 4)
# Tests using TensorOp
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass.DataType.s32,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)
# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmS8Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(GemmS8Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [64, 128, 8], [1, 2, 1], 2)
add_test_simt(GemmS8Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 64, 8], [2, 1, 1], 2)
add_test_simt(GemmS8Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 8], [1, 1, 1], 2)
add_test_simt(GemmS8Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
# Stream K tests
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(GemmS8Sm80StreamK, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
if __name__ == '__main__':

View File

@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM90
from functools import partial
import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc
cutlass.set_log_level(logging.WARNING)
cc = 90
# Partial specialziation for naming tests
bound_type = binding_type(cutlass.DataType.s8)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
def add_test(cls, layouts, alignments, element_output, element_accumulator,
cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
"""
Create a test-running function with the given specification and set it as a method of `cls`.
:param cls: class to which the generated method will be added
:type cls: type
:param layouts: layouts of A, B, and C operands
:type layouts: list or tuple
:param alignments: alingments of A, B, and C operands
:type alignments: list or tuple
:param element_output: data type of the output element
:type element_output: cutlass.DataType
:param element_accumulator: data type used in accumulation
:type element_accumulator: cutlass.DataType
:param cluster_shape: dimensions of threadblock cluster
:type cluster_shape: list or tuple
:param threadblock_shape: dimensions of threadblock tiles
:type threadblock_shape: list or tuple
:param warp_count: warps to be launched per threadblock dimension
:type warp_count: list or tuple
:param stages: number of pipeline stages to use in the kernel
:type stages: int
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
:type opclass: cutlass.OpClass
:param persistent: whether this is a persistent warp-specialized kernel
:type persistent: bool
:param swizzle: threadblock swizzling functor
"""
def run(self):
"""
Dynamically-generated function that constructs a GEMM operation and verifies it against
multiple test cases.
"""
element_A = cutlass.DataType.s8
element_B = cutlass.DataType.s8
layout_A, layout_B, layout_C = layouts
alignment_A, alignment_B, alignment_C = alignments
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
element_C=element_output, element_D=element_output,
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
element_accumulator=element_accumulator)
plan.opclass = opclass
if swizzle is not None:
plan.swizzling_functor = swizzle
td = plan.tile_descriptions()[0]
td.threadblock_shape = threadblock_shape
td.stages = stages
td.cluster_shape = cluster_shape
td.persistent = persistent
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
self.assertTrue(test_all_gemm(op, 'universal'))
if persistent:
suffix = "_persistent"
else:
suffix = ""
element_epilogue = element_accumulator
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
opclass=binding_opclass(opclass), suffix=suffix)
setattr(cls, name, run)
return run
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmS8Sm90(unittest.TestCase):
"""
@ -129,26 +55,40 @@ class GemmS8Sm90(unittest.TestCase):
pass
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
# Tests with 1x1x1 clusters
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], 3)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 8], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 64, 32], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [4, 4, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with different cluster shapes
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 2, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 4, 1], [128, 128, 128], None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
# Tests with persistent warp-specialized threadblocks
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 1, 1], [128, 128, 128], None, persistent=True)
# Tests with warp-specialized ping-pong schedule
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)
# Tests for SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 32, 8], 2)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
if __name__ == '__main__':

View File

@ -0,0 +1,285 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Tests the high-level Conv2d interface
"""
from math import ceil
import unittest
import cutlass
import cutlass_bindings
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
import os
class Conv2dEquivalence:
"""
Helper class for testing the equivalence of different constructions of the Conv2d interface
"""
def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
alignment_A, alignment_B, alignment_C):
self.element_A = element_A
self.element_B = element_B
self.element_C = element_C
self.element_D = element_D
self.element_accumulator = element_accumulator
self.alignment_A = alignment_A
self.alignment_B = alignment_B
self.alignment_C = alignment_C
self.conv_kind = conv_kind
self.plan = cutlass.op.Conv2d(
kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
element_D=element_D, element_accumulator=element_accumulator)
self.op = self.plan.construct(
alignment_A=self.alignment_A, alignment_B=self.alignment_B,
alignment_C=self.alignment_C)
def _plans_equal(self, other_plan) -> bool:
"""
Compares whether two plans are equal
:param other_plan: plan to compare against the default Conv2d
:type other_plan: cutlass.op.Conv2d
:return: whether `other_plan` is equivalent to `self.plan`
:rtype: bool
"""
other_op = other_plan.construct(
alignment_A=self.alignment_A, alignment_B=self.alignment_B,
alignment_C=self.alignment_C)
return self.op.rt_module.emit() == other_op.rt_module.emit()
def generic_test(self):
"""
Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
and layouts for constructing the Conv2d interface
"""
if not datatypes.numpy_available:
return
# Test when specifying all parameters
plan_other = cutlass.op.Conv2d(
kind=self.conv_kind,
element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
element_D=self.element_D, element_accumulator=self.element_accumulator)
assert self._plans_equal(plan_other)
# Test when specifying all parameters but A
plan_other = cutlass.op.Conv2d(
kind=self.conv_kind,
element_B=self.element_B, element_C=self.element_C,
element_D=self.element_D, element_accumulator=self.element_accumulator,
element=self.element_A)
assert self._plans_equal(plan_other)
# Test when specifying all parameters but A and B as tensors using generic element and output
plan_other = cutlass.op.Conv2d(
kind=self.conv_kind,
element_C=self.element_C,
element_D=self.element_D, element_accumulator=self.element_accumulator,
element=self.element_A)
assert self._plans_equal(plan_other)
# Test without explicit accumulator. Only run if the type of C and the accumulator are equal
if self.element_C == self.element_accumulator:
plan_other = cutlass.op.Conv2d(
kind=self.conv_kind,
element_C=self.element_C,
element_D=self.element_D,
element=self.element_A)
assert self._plans_equal(plan_other)
# Test with only the generic types. Only rune if the types of A, B, C, and D are the same
if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
and self.element_A == self.element_accumulator):
plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
assert self._plans_equal(plan_other)
def numpy_test(self):
"""
Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
"""
if not datatypes.numpy_available:
return
import numpy as np
type_A = datatypes.numpy_type(self.element_A)
type_B = datatypes.numpy_type(self.element_B)
type_C = datatypes.numpy_type(self.element_C)
type_D = datatypes.numpy_type(self.element_D)
type_accum = datatypes.numpy_type(self.element_accumulator)
size = (2, 2)
A = np.zeros(size, dtype=type_A)
B = np.zeros(size, dtype=type_B)
C = np.zeros(size, dtype=type_C)
D = np.zeros(size, dtype=type_D)
return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
def torch_test(self):
"""
Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
"""
if not datatypes.torch_available:
return
import torch
type_A = datatypes.torch_type(self.element_A)
type_B = datatypes.torch_type(self.element_B)
type_C = datatypes.torch_type(self.element_C)
type_D = datatypes.torch_type(self.element_D)
type_accum = datatypes.torch_type(self.element_accumulator)
size = (2, 2)
A = torch.empty(size, dtype=type_A)
B = torch.empty(size, dtype=type_B)
C = torch.empty(size, dtype=type_C)
D = torch.empty(size, dtype=type_D)
return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
# Test when specifying all parameters via tensors
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
assert self._plans_equal(plan_np)
# Test when specifying all parameters but A as tensors
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
assert self._plans_equal(plan_np)
# Test when specifying all parameters but A and B as tensors and using generic element and output
if type_A == type_B:
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
assert self._plans_equal(plan_np)
# Test without explicit accumulator. Only run if the type of C and the accumulator.
if type_C == type_accum:
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
assert self._plans_equal(plan_np)
# Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
assert self._plans_equal(plan_np)
def test_all(self):
"""
Runs all tests on the Gemm interface
"""
self.generic_test()
self.numpy_test()
self.torch_test()
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
"""
Tests the equivalence of different constructions of the Conv2d interface
"""
pass
type2alignment = {
cutlass.DataType.f16: 8,
cutlass.DataType.f32: 4
}
def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):
test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"
def run(self):
conv2d_eq = Conv2dEquivalence(
conv_kind=conv_kind,
element_A=element_A, element_B=element_B,
element_C=element_C, element_D=element_D,
element_accumulator=element_accumulator,
alignment_A=type2alignment[element_A], alignment_B=type2alignment[element_B],
alignment_C=type2alignment[element_C]
)
conv2d_eq.test_all()
setattr(ConvEquivalenceTest, test_name, run)
for conv_kind in ["fprop", "wgrad", "dgrad"]:
for types in [
[cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16],
[cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32],
[cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16],
[cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
[cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32]
]:
add_test(conv_kind, types[0], types[1], types[2], types[3], types[4])
@unittest.skipIf(device_cc() <= 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
"""
Tests various error scenarios that arise with the high-level Gemm interface
"""
def test_alignment(self):
"""
Tests case in which the alignment specified is unsupported
"""
plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)
def test_invalid_tile_description(self):
"""
Tests scenarios in which an invalid tile description is provided for a given CC
"""
plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)
td = plan.tile_descriptions()[0]
td.threadblock_shape=[17, 32, 5]
plan.tile_description = td
with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
plan.compile()
# Clean up the error message
os.remove("./cutlass_python_compilation_device_error.txt")
if __name__ == '__main__':
unittest.main()

View File

@ -41,6 +41,7 @@ import cutlass
import cutlass_bindings
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
class GemmEquivalence:
@ -220,38 +221,6 @@ class GemmEquivalenceTest(unittest.TestCase):
gemm_eq.test_all()
class ExpectException:
"""
Utility class to assert that an exception was raised when expected
Example:
.. highlight:: python
.. code-block:: python
with ExceptionExpected(True, 'Division by zero'):
x = 1.0 / 0.0
:param exception_expected: whether an exception is expected to be raised
:type exception_expected: bool
:param message: message to print if an exception is raised when not expected or vice versa
:type message: str
"""
def __init__(self, exception_expected: bool, message: str = ''):
self.exception_expected = exception_expected
self.message = message
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, traceback):
exception_raised = exc_type is not None
assert self.exception_expected == exception_raised, self.message
# Suppress the exception
return True
class GemmErrorTests(unittest.TestCase):
"""
Tests various error scenarios that arise with the high-level Gemm interface
@ -316,9 +285,22 @@ class GemmErrorTests(unittest.TestCase):
td.stages = 0
plan.construct(td)
with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
td.stages = 3
plan.construct(td)
if cc < 90:
with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
td.stages = 3
plan.construct(td)
else:
original_kschedule = td.kernel_schedule
original_eschedule = td.epilogue_schedule
with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
td.stages = 3
plan.construct(td)
# Reset schedules
td.kernel_schedule = original_kschedule
td.epilogue_schedule = original_eschedule
with ExpectException(True, f'Requested too many stages'):
td.stages = 100
@ -335,9 +317,25 @@ class GemmErrorTests(unittest.TestCase):
# Reset cluster shape
td.cluster_shape = cluster_shape
kernel_schedule = td.kernel_schedule
with ExpectException(cc < 90, f'Requested a persistent kernel on SM{cc}'):
with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
plan.construct(td)
with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
plan.construct(td)
with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
plan.construct(td)
with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
td.tile_scheduler = cutlass.TileSchedulerType.StreamK
plan.construct(td)
# Ensure that all returned tile descriptions are unique

View File

@ -0,0 +1,65 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Helper functions & classes for interface test
"""
class ExpectException:
"""
Utility class to assert that an exception was raised when expected
Example:
.. highlight:: python
.. code-block:: python
with ExceptionExpected(True, 'Division by zero'):
x = 1.0 / 0.0
:param exception_expected: whether an exception is expected to be raised
:type exception_expected: bool
:param message: message to print if an exception is raised when not expected or vice versa
:type message: str
"""
def __init__(self, exception_expected: bool, message: str = ''):
self.exception_expected = exception_expected
self.message = message
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, traceback):
exception_raised = exc_type is not None
assert self.exception_expected == exception_raised, self.message
# Suppress the exception
return True