test/python/conv2d/conv2d_sm80.py  (new file, 138 lines)
@@ -0,0 +1,138 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

"""
Low-level functionality tests for Conv2d operands on SM80
"""
from conv2d_test_utils import *
import cutlass
import logging


cutlass.set_log_level(logging.WARNING)
cc = 80

@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    pass

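# Each add_test() call below (defined in conv2d_test_utils.py) builds a test-running
# function for one kernel configuration and attaches it to Conv2dSm80 via setattr,
# so unittest discovers one test case per configuration.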
conv_problems = get_conv_problems()

# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
    # F16, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
    # F16, tensor op
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, analytic iterator
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
    # F16, tensor op, f32 output
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
    # F16, tensor op, different tile description
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
    # F32, simt
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="simt", threadblock_shape=[128, 128, 8],
        warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
    # Tf32, tensorop
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
        opclass="tensor_op", threadblock_shape=[128, 128, 16],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8]
    )
    # Split-K
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
        split_k_slices=2)
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
        split_k_slices=5)
    # Swizzling functor
    add_test(
        Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 64, 32],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)

# Tests for few channels and fixed channels
# F16, tensor op, few channels
for c, tb, stage, inst in zip([2, 1],
                              [[128, 128, 64], [128, 128, 32]],
                              [3, 2],
                              [[16, 8, 16], [16, 8, 8]]):
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=tb,
        warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
    )
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
    add_test(
        Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
        opclass="tensor_op", threadblock_shape=[128, 128, 64],
        warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
    )

# Test activations
for activation in ["relu", "leaky_relu"]:
    for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
        add_test(
            Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
            opclass="tensor_op", threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
            split_k_slices=split_k_slices, activation=activation)


if __name__ == '__main__':
    unittest.main()

test/python/conv2d/conv2d_test_utils.py  (new file, 508 lines)
@@ -0,0 +1,508 @@
# (BSD-3-Clause license header, identical to the one at the top of conv2d_sm80.py)

"""
Utility functions for Conv2d tests
"""
import torch
import cutlass
import unittest
import cutlass_bindings
from cutlass.utils.datatypes import binding_type, binding_opclass
from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
from cutlass.backend.utils.device import device_cc
from cutlass.backend.test.utils import get_name_conv2d
import numpy as np

def conv2d_few_channel_problemsizes(channels):
    problem_sizes = [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 8, 8, channels),
            cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(2, 2),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
            cutlass_bindings.Tensor4DCoord(16, 3, 3, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(2, 2),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 16, 16, channels),
            cutlass_bindings.Tensor4DCoord(16, 7, 7, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
            cutlass_bindings.Tensor4DCoord(32, 7, 7, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
            cutlass_bindings.Tensor4DCoord(64, 7, 7, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(2, 2),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 224, 224, channels),
            cutlass_bindings.Tensor4DCoord(64, 5, 5, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(2, 2),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
    ]

    return problem_sizes

torch_dtype = {
    cutlass.DataType.f16: torch.float16,
    cutlass.DataType.f32: torch.float32,
    cutlass.DataType.f64: torch.float64
}

numpy_dtype = {
    cutlass.DataType.f16: np.float16,
    cutlass.DataType.f32: np.float32,
    cutlass.DataType.f64: np.float64
}

def validate_problem_size(ps, conv_kind, split_k_slices):
    P = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    Q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if P != ps.P or Q != ps.Q:
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad
    if conv_kind == "dgrad" and split_k_slices > 1 and (ps.stride_h > 1 or ps.stride_w > 1):
        return False
    return True

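# Worked example of the output-size check above (illustrative numbers only):
# for H = W = 8, pad = 1, dilation = 1, R = S = 3, stride = 2,
#   P = Q = (8 + 2*1 - 1*(3 - 1) - 1) // 2 + 1 = 4,
# so only problem sizes constructed with P = Q = 4 pass; anything else is skipped.
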
# Override the backend launcher
class Conv2dLauncherFrontend(Conv2dLauncher):
    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator

        self.layout_A = cutlass_bindings.TensorNHWC
        self.layout_B = cutlass_bindings.TensorNHWC
        self.layout_C = cutlass_bindings.TensorNHWC
        self.layout_D = cutlass_bindings.TensorNHWC

        self.element_compute = cutlass_bindings.float32
        self.enable_cached_results = True

        # Get randomization_max
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
                self.randomization_max = 2
            else:
                self.randomization_max = 3
        else:
            self.randomization_max = 7

        self.activation = plan.activation

        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d

    def set_seed(self):
        if self.backend == "numpy":
            np.random.seed(self.seed)
        else:
            torch.manual_seed(self.seed)

    def uniform_init(self, size, dtype):
        if self.backend == "numpy":
            return super().uniform_init(size, numpy_dtype[dtype])
        else:
            tensor = torch.ceil(
                torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
            ).to(memory_format=torch.channels_last)
            return tensor

    def zeros_like(self, tensor):
        if self.backend == "numpy":
            return np.zeros_like(tensor)
        else:
            return torch.zeros_like(tensor).to(memory_format=torch.channels_last)

    def reference(self, ps, A, B, C, alpha, beta, activation):
        if self.backend == "numpy":
            numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
            return numpy_result
        else:
            if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
                torch_result = alpha * torch.ops.aten.conv2d(
                    A,
                    B,
                    stride=(ps.stride_h, ps.stride_w),
                    padding=(ps.pad_h, ps.pad_w),
                    dilation=(ps.dilation_h, ps.dilation_w)
                ) + beta * C
            elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
                torch_result = alpha * torch.nn.grad.conv2d_input(
                    (ps.N, ps.C, ps.H, ps.W),
                    B,
                    A,
                    padding=(ps.pad_h, ps.pad_w),
                    stride=(ps.stride_h, ps.stride_w)
                ) + beta * C
            elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
                torch_result = alpha * torch.nn.grad.conv2d_weight(
                    B,
                    (ps.K, ps.C, ps.R, ps.S),
                    A,
                    padding=(ps.pad_h, ps.pad_w),
                    stride=(ps.stride_h, ps.stride_w)
                ) + beta * C
            else:
                raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

            if activation == cutlass.backend.epilogue.relu:
                torch_result = torch.nn.functional.relu(torch_result)
            elif activation == cutlass.backend.epilogue.leaky_relu:
                torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)

            return torch_result

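    # Note on the torch reference path above: torch.ops.aten.conv2d provides the
    # fprop reference, while torch.nn.grad.conv2d_input and torch.nn.grad.conv2d_weight
    # serve as the dgrad and wgrad references; in both gradient cases tensor A carries
    # the incoming gradient, and the size tuple fixes the shape of the gradient produced.
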
    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
        if self.element_compute == cutlass_bindings.float16:
            alpha = cutlass_bindings.float16(alpha)
            beta = cutlass_bindings.float16(beta)
        elif self.element_compute == cutlass_bindings.int32:
            alpha = int(alpha)
            beta = int(beta)
        else:
            alpha = alpha
            beta = beta

        # If cached result is loaded
        cached_result_loaded = False

        if self.enable_cached_results:
            # Get problem key
            cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
                self.conv_kind,
                problem_size,
                alpha,
                beta,
                getTensorView(
                    tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
                ),
                getTensorView(
                    tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
                ),
                getTensorView(
                    tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
                ),
            )

            cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"

            cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()

            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
                self.operation.arch,
                self.seed,
            )

            cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
                conv2d_result_cache_name
            )
            # CachedTestResultListing cached_results(conv2d_result_cache_name);
            cached = cached_results.find(cached_test_key)
            cached_result_loaded = cached[0]
            if cached_result_loaded:
                cached_test_result = cached[1]

        if not cached_result_loaded:
            # Compute the conv2d on host
            tensor_D_ref = np.ones_like(tensor_C)
            tensor_ref_A = getTensorRef(
                tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
            )
            tensor_ref_B = getTensorRef(
                tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
            )
            tensor_ref_C = getTensorRef(
                tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
            )
            tensor_ref_D_ref = getTensorRef(
                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
            )

            self.host_conv2d(
                self.conv_kind,
                problem_size,
                tensor_ref_A,
                tensor_ref_B,
                tensor_ref_C,
                tensor_ref_D_ref,
                alpha,
                beta,
            )

            if activation == cutlass.backend.epilogue.leaky_relu:
                tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
            else:
                tensor_D_ref = activation.numpy(tensor_D_ref)

            tensor_view_D_ref = getTensorView(
                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
            )

            if self.enable_cached_results:
                cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
                    tensor_view_D_ref
                )
                cached_results = (
                    cutlass_bindings.test.conv.host.CachedTestResultListing(
                        conv2d_result_cache_name
                    )
                )
                cached_results.append(cached_test_key, cached_test_result)
                cached_results.write(conv2d_result_cache_name)
            else:
                return tensor_D_ref

        return cached_test_result.D

    def equal(self, tensor_D, tensor_D_ref, problem_size):
        if self.backend == "numpy":
            return super().equal(tensor_D, tensor_D_ref, problem_size)
        else:
            torch.cuda.synchronize()
            return torch.equal(tensor_D, tensor_D_ref)

    def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        #
        # Initialize input and output tensors
        #
        if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
                tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
            else:
                tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
                tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
        elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
                tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
            else:
                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
                tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
        elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
                tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
                tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
            else:
                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
                tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
                tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        self.set_seed()

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = self.zeros_like(tensor_C)

        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
                           stride=(ps.stride_h, ps.stride_w),
                           padding=(ps.pad_h, ps.pad_w),
                           dilation=(ps.dilation_h, ps.dilation_w),
                           alpha=alpha, beta=beta,
                           split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(
            ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
        )

        return self.equal(tensor_D, tensor_D_ref, ps)


def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation="identity"
):
    """Create a test-running function with the given specification"""
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )

        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation != "identity":
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)

        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")

        for ps in problem_sizes:
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue

            self.assertTrue(
                conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
            )

    setattr(cls, test_name, run)

    return run

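# Example of how one configuration is registered (mirrors the calls in
# conv2d_sm80.py; shown here for illustration only):
#
#   add_test(
#       Conv2dSm80, 80, "fprop", get_conv_problems(),
#       cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
#       opclass="tensor_op", threadblock_shape=[128, 128, 64],
#       warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
#
# This attaches a single generated test method to Conv2dSm80 under the name
# produced by get_name_conv2d.
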
def get_conv_problems():
    # 64: minimum channel size
    conv_problems = list(cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes)
    # Insert alignment 4 & 2 tests
    conv_problems += [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
            cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
            cutlass_bindings.MatrixCoord(3, 3),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 4, 4, 14),
            cutlass_bindings.Tensor4DCoord(8, 3, 3, 14),
            cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
            cutlass_bindings.MatrixCoord(3, 3),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 23, 56, 98),
            cutlass_bindings.Tensor4DCoord(128, 3, 3, 98),
            cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
            cutlass_bindings.MatrixCoord(3, 3),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        ),
    ]

    return conv_problems
test/python/conv2d/run_all_tests.py  (new file, 42 lines)
@@ -0,0 +1,42 @@
# (BSD-3-Clause license header, identical to the one at the top of conv2d_sm80.py)

import unittest


if __name__ == '__main__':
    loader = unittest.TestLoader()
    tests = loader.discover('./', 'conv2d_*.py')
    testRunner = unittest.runner.TextTestRunner()
    results = testRunner.run(tests)
    if not results.wasSuccessful():
        raise Exception('Test cases failed')
@@ -39,6 +39,7 @@ import tempfile
import unittest

import cutlass
import cutlass_bindings

if cutlass.utils.datatypes.torch_available:
    import torch
@@ -85,6 +86,34 @@ def _generate_problems(dtype, num):
        Ds.append(D)
    return As, Bs, Cs, Ds

def _generate_conv2d_problem(conv_kind, dtype, ps):
    """
    Utility function to generate conv2d inputs

    :param conv_kind: kind of convolution
    :type conv_kind: str
    :param dtype: data type of tensors
    :param ps: the conv2d problem size
    :type ps: cutlass_bindings.conv.Conv2dProblemSize

    :return: initialized tensors A, B, and C
    :rtype: list
    """
    if conv_kind == "fprop":
        tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
        tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
    elif conv_kind == "dgrad":
        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
        tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
        tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
    else:
        tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
        tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
        tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
    sizes = [tensor_A_size, tensor_B_size, tensor_C_size]
    return [torch.ceil(torch.empty(size, dtype=dtype, device='cuda').uniform_(-4.5, 3.5)).to(memory_format=torch.channels_last) for size in sizes]

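# Note on _generate_conv2d_problem above: tensors are allocated with NCHW logical
# shapes but converted to torch.channels_last memory format, which is the in-memory
# equivalent of the NHWC layout that the CUTLASS conv2d kernels expect.
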
@unittest.skipIf(not cutlass.utils.datatypes.torch_available, 'PyTorch must be available to run PyTorch extension tests')
class PyTorchExtensionTest(unittest.TestCase):
@@ -155,6 +184,127 @@ class PyTorchExtensionTest(unittest.TestCase):
        Ds_ref = [(a @ b) * alpha + (beta * c) for a, b, c in zip(As, Bs, Cs)]
        Ds = mod.run(As, Bs, Cs, alpha, beta)
        check_all(Ds, Ds_ref)

    def test_conv2d_fprop(self):
        torch.manual_seed(2023)

        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="fprop", element=dtype, element_accumulator=torch.float32)
        plan.activation = "relu"

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

            problem_size = cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            )

            A, B, C = _generate_conv2d_problem("fprop", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)

            alpha = 1.0
            beta = 0.5

            D_ref = alpha * torch.ops.aten.conv2d(
                A, B, stride=stride, padding=padding
            ) + beta * C
            D_ref = torch.nn.functional.relu(D_ref)
            D = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta)

            assert torch.allclose(D, D_ref)

            # Test serial split-K
            D_serial_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)

            # Test parallel split-K
            D_parallel_split_k = mod.run(A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)

    def test_conv2d_dgrad(self):
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="dgrad", element=dtype, element_accumulator=torch.float32)

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

            problem_size = cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            )

            A, B, C = _generate_conv2d_problem("dgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)

            alpha = 1.0
            beta = 0.5
            input_size = (problem_size.N, problem_size.C, problem_size.H, problem_size.W)
            D_ref = alpha * torch.nn.grad.conv2d_input(
                input_size, B, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(input_size, A, B, C, stride, padding, alpha=alpha, beta=beta)

            assert torch.allclose(D, D_ref)

    def test_conv2d_wgrad(self):
        torch.manual_seed(2023)
        dtype = torch.float16
        plan = cutlass.op.Conv2d(kind="wgrad", element=dtype, element_accumulator=torch.float32)

        op = plan.construct()
        with tempfile.TemporaryDirectory() as tmpdir:
            mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)

            problem_size = cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            )

            A, B, C = _generate_conv2d_problem("wgrad", dtype, problem_size)
            stride = (problem_size.stride_h, problem_size.stride_w)
            padding = (problem_size.pad_h, problem_size.pad_w)

            alpha = 1.0
            beta = 0.5
            weight_size = (problem_size.K, problem_size.C, problem_size.R, problem_size.S)
            D_ref = alpha * torch.nn.grad.conv2d_weight(
                B, weight_size, A,
                stride=stride, padding=padding
            ) + beta * C
            D = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta)

            assert torch.allclose(D, D_ref)

            # Test serial split-K
            D_serial_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="serial", split_k_slices=3)
            assert torch.allclose(D, D_serial_split_k)

            # Test parallel split-K
            D_parallel_split_k = mod.run(weight_size, A, B, C, stride, padding, alpha=alpha, beta=beta, split_k_mode="parallel", split_k_slices=7)
            assert torch.allclose(D, D_parallel_split_k)


if __name__ == '__main__':

@@ -37,82 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM80
from functools import partial

import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest

from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc


cutlass.set_log_level(logging.WARNING)
cc = 80

# Partial specialization for naming tests
bound_type = binding_type(cutlass.DataType.f16)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)

def add_test(cls, layouts, alignments, element_output, element_accumulator,
             threadblock_shape, warp_count, stages, opclass, swizzle=None):
    """
    Create a test-running function with the given specification and set it as a method of `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpClass
    :param swizzle: threadblock swizzling functor
    """
    cluster_shape = [1, 1, 1]

    def run(self):
        """
        Dynamically-generated function that constructs a GEMM operation and verifies it against
        multiple test cases.
        """
        element_A = cutlass.DataType.f16
        element_B = cutlass.DataType.f16
        layout_A, layout_B, layout_C = layouts
        alignment_A, alignment_B, alignment_C = alignments

        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                               element_C=element_output, element_D=element_output,
                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                               element_accumulator=element_accumulator,
                               kernel_cc=cc)

        plan.opclass = opclass
        if swizzle is not None:
            plan.swizzling_functor = swizzle
        td = plan.tile_descriptions()[0]
        td.threadblock_shape = threadblock_shape
        td.stages = stages
        td.warp_count = warp_count
        td.cluster_shape = cluster_shape
        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
        self.assertTrue(test_all_gemm(op, 'universal'))

    element_epilogue = element_accumulator
    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
    setattr(cls, name, run)

    return run

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
class GemmF16Sm80(unittest.TestCase):
    """
@@ -128,40 +62,64 @@ class GemmF16Sm80StreamK(unittest.TestCase):
    """
    pass

add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f16, cc=cc, cluster_shape=[1, 1, 1])

# Tests using TensorOp
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

add_test_tensorop(GemmF16Sm80, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 32], [2, 1, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
add_test_tensorop(GemmF16Sm80, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 32], [2, 2, 1], 3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 32], warp_count=[2, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)
add_test_tensorop(cls=GemmF16Sm80, layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                  element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)

# Tests using SIMT
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)

add_test_simt(GemmF16Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
add_test_simt(GemmF16Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [128, 128, 8], [2, 2, 1], 2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
add_test_simt(cls=GemmF16Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f16,
              element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)

# Stream K tests
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
add_test_streamk(GemmF16Sm80StreamK, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [64, 64, 64], [1, 1, 1], 5)
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
add_test_streamk(cls=GemmF16Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                 element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=5)

if __name__ == '__main__':
    unittest.main()

@@ -37,87 +37,16 @@ Low-level functionality tests for GEMM with F16 operands on SM90
from functools import partial

import cutlass
from cutlass.utils.datatypes import binding_opclass, binding_type
from cutlass.backend.test.gemm_testbed import test_all_gemm
import logging
import unittest

from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
from cutlass.backend.utils.device import device_cc


cutlass.set_log_level(logging.WARNING)
cc = 90

# Partial specialization for naming tests
bound_type = binding_type(cutlass.DataType.f16)
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)

def add_test(cls, layouts, alignments, element_output, element_accumulator,
             cluster_shape, threadblock_shape, stages, opclass,
             kernel_schedule=cutlass.KernelScheduleType.ScheduleAuto,
             swizzle=None):
    """
    Create a test-running function with the given specification and set it as a method of `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of threadblock cluster
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpClass
    :param kernel_schedule: kernel schedule type
    :type kernel_schedule: cutlass.KernelScheduleType
    :param swizzle: threadblock swizzling functor
    """

    def run(self):
        """
        Dynamically-generated function that constructs a GEMM operation and verifies it against
        multiple test cases.
        """
        element_A = cutlass.DataType.f16
        element_B = cutlass.DataType.f16
        layout_A, layout_B, layout_C = layouts
        alignment_A, alignment_B, alignment_C = alignments

        plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                               element_C=element_output, element_D=element_output,
                               layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                               element_accumulator=element_accumulator)

        plan.opclass = opclass
        if swizzle is not None:
            plan.swizzling_functor = swizzle
        td = plan.tile_descriptions()[0]
        td.threadblock_shape = threadblock_shape
        td.stages = stages
        td.cluster_shape = cluster_shape
        td.kernel_schedule = kernel_schedule
        op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
        self.assertTrue(test_all_gemm(op, 'universal'))

    element_epilogue = element_accumulator
    name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
                   binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
                   opclass=binding_opclass(opclass), kernel_schedule=kernel_schedule)
    setattr(cls, name, run)

    return run

@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
class GemmF16Sm90(unittest.TestCase):
    """
@@ -126,47 +55,85 @@ class GemmF16Sm90(unittest.TestCase):
    pass

add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
                               warp_count=None, compilation_modes=['nvcc'])

add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)

# Tests with 1x1x1 clusters
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 64], 5)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 32], None)
add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])
add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=3)
add_test_unit_cluster(layouts=LayoutCombination.NNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.NTT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[4, 4, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 64], stages=5)
add_test_unit_cluster(layouts=LayoutCombination.TNT, alignments=[2, 2, 2], element_output=cutlass.DataType.f16,
                      element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 32], stages=None)
|
||||
# Tests with different cluster shapes
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass.DataType.f16, cutlass.DataType.f16, [2, 2, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 2, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [1, 4, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [2, 4, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 1, 1], [64, 128, 64], None)
|
||||
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, [4, 2, 1], [64, 128, 64], None)
|
||||
add_test_cluster_shape = partial(add_test_tensorop, threadblock_shape=[64, 128, 64], stages=None)
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 8], element_output=cutlass.DataType.f16,
|
||||
element_accumulator=cutlass.DataType.f16, cluster_shape=[2, 2, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.NTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.NNN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 2, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[1, 4, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[2, 4, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 1, 1])
|
||||
add_test_cluster_shape(layouts=LayoutCombination.TTN, alignments=[8, 8, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, cluster_shape=[4, 2, 1])
|
||||
|
||||
# Tests for different schedule modes
|
||||
add_test_schedule = partial(add_test, GemmF16Sm90, LayoutCombination.TTN, [8, 8, 4], cutlass.DataType.f32, cutlass.DataType.f32, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
|
||||
add_test_schedule([1, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
|
||||
add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
|
||||
add_test_schedule([2, 1, 1], [128, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
|
||||
add_test_schedule([2, 1, 1], [256, 128, 64], None, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
|
||||
add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong)
|
||||
add_test_schedule([2, 1, 1], [128, 128, 64], 5, kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative)
|
||||
add_test_schedule = partial(add_test_specialized, layouts=LayoutCombination.TTN, alignments=[8, 8, 4],
|
||||
element_output=cutlass.DataType.f32, element_accumulator=cutlass.DataType.f32,
|
||||
opclass=cutlass.OpcodeClass.TensorOp, threadblock_shape=[128, 128, 64], stages=None)
|
||||
add_test_schedule(
|
||||
cluster_shape=[1, 1, 1],
|
||||
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
|
||||
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
|
||||
)
|
||||
add_test_schedule(
|
||||
cluster_shape=[1, 1, 1],
|
||||
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
|
||||
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
|
||||
)
|
||||
add_test_schedule(
|
||||
cluster_shape=[2, 1, 1],
|
||||
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
|
||||
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized
|
||||
)
|
||||
add_test_schedule(
|
||||
cluster_shape=[2, 1, 1],
|
||||
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedCooperative,
|
||||
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
|
||||
)
|
||||
|
||||
# Tests using SIMT
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 128, 8], 2)
|
||||
add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 128, 8], 2)
|
||||
add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [128, 64, 8], 2)
|
||||
add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f32, [1, 1, 1], [64, 64, 8], 2)
|
||||
add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f16, cutlass.DataType.f16, [1, 1, 1], [128, 128, 8], 2)
|
||||
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt, alignments=[1, 1, 1], cluster_shape=[1, 1, 1], stages=2)
|
||||
add_test_simt(layouts=LayoutCombination.NNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8])
|
||||
add_test_simt(layouts=LayoutCombination.TNN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8])
|
||||
add_test_simt(layouts=LayoutCombination.NTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8])
|
||||
add_test_simt(layouts=LayoutCombination.TTN, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8])
|
||||
add_test_simt(layouts=LayoutCombination.NNT, element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f16, threadblock_shape=[128, 128, 8])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
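For readers tracing the layered functools.partial specializations above, the sketch below is an editor's illustration, not part of the patch: it spells out what one add_test_unit_cluster(...) call reduces to once every layer is applied. The keyword interface of add_test_gemm is inferred from the calls shown in this diff and is otherwise an assumption.

    # Editor's illustration (not part of the patch). Assumes add_test_gemm accepts
    # exactly the keyword arguments used by the calls above.
    from functools import partial

    add_test_specialized = partial(add_test_gemm, cls=GemmF16Sm90, element=cutlass.DataType.f16,
                                   warp_count=None, compilation_modes=['nvcc'])
    add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
    add_test_unit_cluster = partial(add_test_tensorop, cluster_shape=[1, 1, 1])

    # With those layers in place, a call such as
    #   add_test_unit_cluster(layouts=LayoutCombination.NNN, alignments=[8, 8, 8],
    #                         element_output=cutlass.DataType.f16,
    #                         element_accumulator=cutlass.DataType.f32,
    #                         threadblock_shape=[128, 128, 32], stages=3)
    # is equivalent to the fully spelled-out invocation:
    add_test_gemm(cls=GemmF16Sm90, element=cutlass.DataType.f16, warp_count=None,
                  compilation_modes=['nvcc'], opclass=cutlass.OpcodeClass.TensorOp,
                  cluster_shape=[1, 1, 1], layouts=LayoutCombination.NNN, alignments=[8, 8, 8],
                  element_output=cutlass.DataType.f16, element_accumulator=cutlass.DataType.f32,
                  threadblock_shape=[128, 128, 32], stages=3)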
@ -37,83 +37,16 @@ Low-level functionality tests for GEMM with F32 operands on SM80
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
from cutlass.utils.datatypes import binding_opclass, binding_type
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
# Partial specialization for naming tests
|
||||
bound_type = binding_type(cutlass.DataType.f32)
|
||||
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator,
|
||||
threadblock_shape, warp_count, stages, opclass, swizzle=None):
|
||||
"""
|
||||
Create a test-running function with the given specification and set it as a method of `cls`.
|
||||
|
||||
:param cls: class to which the generated method will be added
|
||||
:type cls: type
|
||||
:param layouts: layouts of A, B, and C operands
|
||||
:type layouts: list or tuple
|
||||
:param alignments: alignments of A, B, and C operands
|
||||
:type alignments: list or tuple
|
||||
:param element_output: data type of the output element
|
||||
:type element_output: cutlass.DataType
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:type element_accumulator: cutlass.DataType
|
||||
:param threadblock_shape: dimensions of threadblock tiles
|
||||
:type threadblock_shape: list or tuple
|
||||
:param warp_count: warps to be launched per threadblock dimension
|
||||
:type warp_count: list or tuple
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpClass
|
||||
:param swizzle: threadblock swizzling functor
|
||||
"""
|
||||
|
||||
cluster_shape = [1, 1, 1]
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Dynamically-generated function that constructs a GEMM operation and verifies it against
|
||||
multiple test cases.
|
||||
"""
|
||||
element_A = cutlass.DataType.f32
|
||||
element_B = cutlass.DataType.f32
|
||||
layout_A, layout_B, layout_C = layouts
|
||||
alignment_A, alignment_B, alignment_C = alignments
|
||||
|
||||
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
|
||||
element_C=element_output, element_D=element_output,
|
||||
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
|
||||
element_accumulator=element_accumulator,
|
||||
kernel_cc=cc)
|
||||
|
||||
plan.opclass = opclass
|
||||
if swizzle is not None:
|
||||
plan.swizzling_functor = swizzle
|
||||
td = plan.tile_descriptions()[0]
|
||||
td.threadblock_shape = threadblock_shape
|
||||
td.stages = stages
|
||||
td.warp_count = warp_count
|
||||
td.cluster_shape = cluster_shape
|
||||
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
|
||||
|
||||
self.assertTrue(test_all_gemm(op, 'universal'))
|
||||
|
||||
element_epilogue = element_accumulator
|
||||
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
|
||||
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
|
||||
setattr(cls, name, run)
|
||||
|
||||
return run
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
|
||||
class GemmF32Sm80(unittest.TestCase):
|
||||
"""
|
||||
@ -130,25 +63,37 @@ class GemmF32Sm80StreamK(unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f32, cc=cc, cluster_shape=[1, 1, 1])
|
||||
|
||||
# Tests using TensorOp
|
||||
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
|
||||
add_test_tensorop(GemmF32Sm80, LayoutCombination.NNN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
|
||||
add_test_tensorop(GemmF32Sm80, LayoutCombination.NNT, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
|
||||
add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 32], [1, 2, 1], 3)
|
||||
add_test_tensorop(GemmF32Sm80, LayoutCombination.NTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 32], [1, 1, 1], 4)
|
||||
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 32], warp_count=[1, 2, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 32], warp_count=[1, 1, 1], stages=4)
|
||||
# Tests using SIMT
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
|
||||
|
||||
add_test_simt(GemmF32Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(GemmF32Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 128, 8], [1, 2, 1], 2)
|
||||
add_test_simt(GemmF32Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 64, 8], [2, 1, 1], 2)
|
||||
add_test_simt(GemmF32Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [64, 64, 8], [1, 1, 1], 2)
|
||||
add_test_simt(GemmF32Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmF32Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
|
||||
# Stream K tests
|
||||
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(GemmF32Sm80StreamK, LayoutCombination.TTN, [4, 4, 4], cutlass.DataType.f32, cutlass.DataType.f32, [128, 128, 32], [2, 2, 1], 3)
|
||||
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(cls=GemmF32Sm80StreamK, layouts=LayoutCombination.TTN, alignments=[4, 4, 4], element_output=cutlass.DataType.f32,
|
||||
element_accumulator=cutlass.DataType.f32, threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1], stages=3)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM80
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
from cutlass.utils.datatypes import binding_opclass, binding_type
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
# Partial specialization for naming tests
|
||||
bound_type = binding_type(cutlass.DataType.f64)
|
||||
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator,
|
||||
threadblock_shape, warp_count, stages, opclass, swizzle=None):
|
||||
"""
|
||||
Create a test-running function with the given specification and set it as a method of `cls`.
|
||||
|
||||
:param cls: class to which the generated method will be added
|
||||
:type cls: type
|
||||
:param layouts: layouts of A, B, and C operands
|
||||
:type layouts: list or tuple
|
||||
:param alignments: alignments of A, B, and C operands
|
||||
:type alignments: list or tuple
|
||||
:param element_output: data type of the output element
|
||||
:type element_output: cutlass.DataType
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:type element_accumulator: cutlass.DataType
|
||||
:param threadblock_shape: dimensions of threadblock tiles
|
||||
:type threadblock_shape: list or tuple
|
||||
:param warp_count: warps to be launched per threadblock dimension
|
||||
:type warp_count: list or tuple
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpClass
|
||||
:param swizzle: threadblock swizzling functor
|
||||
"""
|
||||
|
||||
cluster_shape = [1, 1, 1]
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Dynamically-generated function that constructs a GEMM operation and verifies it against
|
||||
multiple test cases.
|
||||
"""
|
||||
element_A = cutlass.DataType.f64
|
||||
element_B = cutlass.DataType.f64
|
||||
layout_A, layout_B, layout_C = layouts
|
||||
alignment_A, alignment_B, alignment_C = alignments
|
||||
|
||||
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
|
||||
element_C=element_output, element_D=element_output,
|
||||
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
|
||||
element_accumulator=element_accumulator,
|
||||
kernel_cc=cc)
|
||||
|
||||
plan.opclass = opclass
|
||||
if swizzle is not None:
|
||||
plan.swizzling_functor = swizzle
|
||||
td = plan.tile_descriptions()[0]
|
||||
td.threadblock_shape = threadblock_shape
|
||||
td.stages = stages
|
||||
td.warp_count = warp_count
|
||||
td.cluster_shape = cluster_shape
|
||||
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
|
||||
|
||||
self.assertTrue(test_all_gemm(op, 'universal'))
|
||||
|
||||
element_epilogue = element_accumulator
|
||||
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
|
||||
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
|
||||
setattr(cls, name, run)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
|
||||
class GemmF64Sm80(unittest.TestCase):
|
||||
"""
|
||||
@ -131,25 +63,36 @@ class GemmF64Sm80StreamK(unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
# Tests using TensorOp
|
||||
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.f64, cc=cc, cluster_shape=[1, 1, 1])
|
||||
|
||||
add_test_tensorop(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
|
||||
add_test_tensorop(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 16], [2, 2, 1], 4)
|
||||
add_test_tensorop(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [32, 32, 16], [2, 1, 1], 5)
|
||||
# Tests using TensorOp
|
||||
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
|
||||
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 16], warp_count=[2, 2, 1], stages=4)
|
||||
add_test_tensorop(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 32, 32, 16], warp_count=[2, 1, 1], stages=5)
|
||||
|
||||
# Tests using SIMT
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
|
||||
|
||||
add_test_simt(GemmF64Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(GemmF64Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 128, 8], [1, 2, 1], 2)
|
||||
add_test_simt(GemmF64Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 64, 8], [2, 1, 1], 2)
|
||||
add_test_simt(GemmF64Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [64, 64, 8], [1, 1, 1], 2)
|
||||
add_test_simt(GemmF64Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmF64Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
|
||||
# Stream K tests
|
||||
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(GemmF64Sm80StreamK, LayoutCombination.NTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [128, 128, 16], [4, 2, 1], 3)
|
||||
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(cls=GemmF64Sm80StreamK, layouts=LayoutCombination.NTT, alignments=[1, 1, 1], element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, threadblock_shape=[128, 128, 16], warp_count=[4, 2, 1], stages=3)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with F64 operands on SM90
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
from cutlass.utils.datatypes import binding_opclass, binding_type
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
|
||||
# Partial specialization for naming tests
|
||||
bound_type = binding_type(cutlass.DataType.f64)
|
||||
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator,
|
||||
cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
|
||||
"""
|
||||
Create a test-running function with the given specification and set it as a method of `cls`.
|
||||
|
||||
:param cls: class to which the generated method will be added
|
||||
:type cls: type
|
||||
:param layouts: layouts of A, B, and C operands
|
||||
:type layouts: list or tuple
|
||||
:param alignments: alignments of A, B, and C operands
|
||||
:type alignments: list or tuple
|
||||
:param element_output: data type of the output element
|
||||
:type element_output: cutlass.DataType
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:type element_accumulator: cutlass.DataType
|
||||
:param cluster_shape: dimensions of threadblock cluster
|
||||
:type cluster_shape: list or tuple
|
||||
:param threadblock_shape: dimensions of threadblock tiles
|
||||
:type threadblock_shape: list or tuple
|
||||
:param warp_count: warps to be launched per threadblock dimension
|
||||
:type warp_count: list or tuple
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpClass
|
||||
:param persistent: whether this is a persistent warp-specialized kernel
|
||||
:type persistent: bool
|
||||
:param swizzle: threadblock swizzling functor
|
||||
"""
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Dynamically-generated function that constructs a GEMM operation and verifies it against
|
||||
multiple test cases.
|
||||
"""
|
||||
element_A = cutlass.DataType.f64
|
||||
element_B = cutlass.DataType.f64
|
||||
layout_A, layout_B, layout_C = layouts
|
||||
alignment_A, alignment_B, alignment_C = alignments
|
||||
|
||||
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
|
||||
element_C=element_output, element_D=element_output,
|
||||
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
|
||||
element_accumulator=element_accumulator)
|
||||
|
||||
plan.opclass = opclass
|
||||
if swizzle is not None:
|
||||
plan.swizzling_functor = swizzle
|
||||
td = plan.tile_descriptions()[0]
|
||||
td.threadblock_shape = threadblock_shape
|
||||
td.stages = stages
|
||||
td.cluster_shape = cluster_shape
|
||||
td.persistent = persistent
|
||||
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
|
||||
self.assertTrue(test_all_gemm(op, 'universal'))
|
||||
|
||||
if persistent:
|
||||
suffix = "_persistent"
|
||||
else:
|
||||
suffix = ""
|
||||
|
||||
element_epilogue = element_accumulator
|
||||
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
|
||||
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
|
||||
opclass=binding_opclass(opclass), suffix=suffix)
|
||||
setattr(cls, name, run)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
|
||||
class GemmF64Sm90(unittest.TestCase):
|
||||
"""
|
||||
@ -129,13 +55,14 @@ class GemmF64Sm90(unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_specialized = partial(add_test_gemm, cls=GemmF64Sm90, alignments=[1, 1, 1], cluster_shape=[1, 1, 1],
|
||||
element=cutlass.DataType.f64, element_output=cutlass.DataType.f64,
|
||||
element_accumulator=cutlass.DataType.f64, compilation_modes=['nvcc'])
|
||||
|
||||
add_test_tensorop(GemmF64Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
|
||||
add_test_tensorop(GemmF64Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 32], 3)
|
||||
add_test_simt(GemmF64Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [128, 128, 8], 2)
|
||||
add_test_simt(GemmF64Sm90, LayoutCombination.TTT, [1, 1, 1], cutlass.DataType.f64, cutlass.DataType.f64, [1, 1, 1], [64, 128, 8], 2)
|
||||
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.NNT, threadblock_shape=[128, 128, 32], stages=3)
|
||||
add_test_specialized(opclass=cutlass.OpcodeClass.TensorOp, layouts=LayoutCombination.TNN, threadblock_shape=[128, 128, 32], stages=3)
|
||||
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.NNN, threadblock_shape=[128, 128, 8], stages=2)
|
||||
add_test_specialized( opclass=cutlass.OpcodeClass.Simt, layouts=LayoutCombination.TTT, threadblock_shape=[ 64, 128, 8], stages=2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -37,84 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM80
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
from cutlass.utils.datatypes import binding_opclass, binding_type
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
# Partial specialization for naming tests
|
||||
bound_type = binding_type(cutlass.DataType.s8)
|
||||
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator,
|
||||
threadblock_shape, warp_count, stages, opclass, swizzle=None):
|
||||
"""
|
||||
Create a test-running function with the given specification and set it as a method of `cls`.
|
||||
|
||||
:param cls: class to which the generated method will be added
|
||||
:type cls: type
|
||||
:param layouts: layouts of A, B, and C operands
|
||||
:type layouts: list or tuple
|
||||
:param alignments: alignments of A, B, and C operands
|
||||
:type alignments: list or tuple
|
||||
:param element_output: data type of the output element
|
||||
:type element_output: cutlass.DataType
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:type element_accumulator: cutlass.DataType
|
||||
:param threadblock_shape: dimensions of threadblock tiles
|
||||
:type threadblock_shape: list or tuple
|
||||
:param warp_count: warps to be launched per threadblock dimension
|
||||
:type warp_count: list or tuple
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpClass
|
||||
:param swizzle: threadblock swizzling functor
|
||||
"""
|
||||
|
||||
cluster_shape = [1, 1, 1]
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Dynamically-generated function that constructs a GEMM operation and verifies it against
|
||||
multiple test cases.
|
||||
"""
|
||||
element_A = cutlass.DataType.s8
|
||||
element_B = cutlass.DataType.s8
|
||||
layout_A, layout_B, layout_C = layouts
|
||||
alignment_A, alignment_B, alignment_C = alignments
|
||||
|
||||
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
|
||||
element_C=element_output, element_D=element_output,
|
||||
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
|
||||
element_accumulator=element_accumulator,
|
||||
kernel_cc=cc)
|
||||
|
||||
plan.opclass = opclass
|
||||
if swizzle is not None:
|
||||
plan.swizzling_functor = swizzle
|
||||
td = plan.tile_descriptions()[0]
|
||||
td.threadblock_shape = threadblock_shape
|
||||
td.stages = stages
|
||||
td.warp_count = warp_count
|
||||
td.cluster_shape = cluster_shape
|
||||
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
|
||||
|
||||
self.assertTrue(test_all_gemm(op, 'universal'))
|
||||
|
||||
element_epilogue = element_accumulator
|
||||
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
|
||||
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages, opclass=binding_opclass(opclass))
|
||||
setattr(cls, name, run)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM80 tests.')
|
||||
class GemmS8Sm80(unittest.TestCase):
|
||||
"""
|
||||
@ -131,25 +63,36 @@ class GemmS8Sm80StreamK(unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
# Tests using TensorOp
|
||||
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_specialized = partial(add_test_gemm, element=cutlass.DataType.s8, cc=cc, cluster_shape=[1, 1, 1])
|
||||
|
||||
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [256, 128, 64], [4, 2, 1], 3)
|
||||
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
|
||||
add_test_tensorop(GemmS8Sm80, LayoutCombination.TNN, [16, 16, 4], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 64], [1, 1, 1], 4)
|
||||
# Tests using TensorOp
|
||||
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
|
||||
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
|
||||
add_test_tensorop(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[16, 16, 4], element_output=cutlass.DataType.s32,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 64], warp_count=[1, 1, 1], stages=4)
|
||||
|
||||
# Tests using SIMT
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
|
||||
|
||||
add_test_simt(GemmS8Sm80, LayoutCombination.NNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(GemmS8Sm80, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [64, 128, 8], [1, 2, 1], 2)
|
||||
add_test_simt(GemmS8Sm80, LayoutCombination.NTN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [128, 64, 8], [2, 1, 1], 2)
|
||||
add_test_simt(GemmS8Sm80, LayoutCombination.TTN, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [64, 64, 8], [1, 1, 1], 2)
|
||||
add_test_simt(GemmS8Sm80, LayoutCombination.NNT, [1, 1, 1], cutlass.DataType.s32, cutlass.DataType.s32, [128, 128, 8], [2, 2, 1], 2)
|
||||
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 128, 8], warp_count=[1, 2, 1], stages=2)
|
||||
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 64, 8], warp_count=[2, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.TTN, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[ 64, 64, 8], warp_count=[1, 1, 1], stages=2)
|
||||
add_test_simt(cls=GemmS8Sm80, layouts=LayoutCombination.NNT, alignments=[1, 1, 1], element_output=cutlass.DataType.s32,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 128, 8], warp_count=[2, 2, 1], stages=2)
|
||||
|
||||
# Stream K tests
|
||||
add_test_streamk = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(GemmS8Sm80StreamK, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [128, 256, 64], [2, 4, 1], 3)
|
||||
add_test_streamk = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp, swizzle=cutlass.swizzle.ThreadblockSwizzleStreamK)
|
||||
add_test_streamk(cls=GemmS8Sm80StreamK, layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1], stages=3)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@ -37,90 +37,16 @@ Low-level functionality tests for GEMM with S8 operands on SM90
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
from cutlass.utils.datatypes import binding_opclass, binding_type
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
|
||||
# Partial specialization for naming tests
|
||||
bound_type = binding_type(cutlass.DataType.s8)
|
||||
name_fn = partial(get_name, element_a=bound_type, element_b=bound_type, arch=cc)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator,
|
||||
cluster_shape, threadblock_shape, stages, opclass, persistent=False, swizzle=None):
|
||||
"""
|
||||
Create a test-running function with the given specification and set it as a method of `cls`.
|
||||
|
||||
:param cls: class to which the generated method will be added
|
||||
:type cls: type
|
||||
:param layouts: layouts of A, B, and C operands
|
||||
:type layouts: list or tuple
|
||||
:param alignments: alignments of A, B, and C operands
|
||||
:type alignments: list or tuple
|
||||
:param element_output: data type of the output element
|
||||
:type element_output: cutlass.DataType
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:type element_accumulator: cutlass.DataType
|
||||
:param cluster_shape: dimensions of threadblock cluster
|
||||
:type cluster_shape: list or tuple
|
||||
:param threadblock_shape: dimensions of threadblock tiles
|
||||
:type threadblock_shape: list or tuple
|
||||
:param warp_count: warps to be launched per threadblock dimension
|
||||
:type warp_count: list or tuple
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpClass
|
||||
:param persistent: whether this is a persistent warp-specialized kernel
|
||||
:type persistent: bool
|
||||
:param swizzle: threadblock swizzling functor
|
||||
"""
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Dynamically-generated function that constructs a GEMM operation and verifies it against
|
||||
multiple test cases.
|
||||
"""
|
||||
element_A = cutlass.DataType.s8
|
||||
element_B = cutlass.DataType.s8
|
||||
layout_A, layout_B, layout_C = layouts
|
||||
alignment_A, alignment_B, alignment_C = alignments
|
||||
|
||||
plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
|
||||
element_C=element_output, element_D=element_output,
|
||||
layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
|
||||
element_accumulator=element_accumulator)
|
||||
|
||||
plan.opclass = opclass
|
||||
if swizzle is not None:
|
||||
plan.swizzling_functor = swizzle
|
||||
td = plan.tile_descriptions()[0]
|
||||
td.threadblock_shape = threadblock_shape
|
||||
td.stages = stages
|
||||
td.cluster_shape = cluster_shape
|
||||
td.persistent = persistent
|
||||
op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
|
||||
self.assertTrue(test_all_gemm(op, 'universal'))
|
||||
|
||||
if persistent:
|
||||
suffix = "_persistent"
|
||||
else:
|
||||
suffix = ""
|
||||
|
||||
element_epilogue = element_accumulator
|
||||
name = name_fn(layouts, alignments, binding_type(element_output), binding_type(element_accumulator),
|
||||
binding_type(element_epilogue), cluster_shape, threadblock_shape, stages,
|
||||
opclass=binding_opclass(opclass), suffix=suffix)
|
||||
setattr(cls, name, run)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is insufficient for SM90 tests.')
|
||||
class GemmS8Sm90(unittest.TestCase):
|
||||
"""
|
||||
@ -129,26 +55,40 @@ class GemmS8Sm90(unittest.TestCase):
|
||||
pass
|
||||
|
||||
|
||||
add_test_tensorop = partial(add_test, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
add_test_specialized = partial(add_test_gemm, cls=GemmS8Sm90, element=cutlass.DataType.s8, compilation_modes=['nvcc'])
|
||||
|
||||
add_test_tensorop = partial(add_test_specialized, opclass=cutlass.OpcodeClass.TensorOp)
|
||||
|
||||
# Tests with 1x1x1 clusters
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNN, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], 3)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 8], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 128, 128], None)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 64, 32], None)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [4, 4, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [128, 128, 128], None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNN, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=3)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 8], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 128, 128], stages=None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 64, 32], stages=None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[ 4, 4, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[128, 128, 128], stages=None)
|
||||
|
||||
# Tests with different cluster shapes
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 2, 1], [128, 128, 128], None)
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [1, 4, 1], [128, 128, 128], None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 2, 1], threadblock_shape=[128, 128, 128], stages=None)
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 4, 1], threadblock_shape=[128, 128, 128], stages=None)
|
||||
|
||||
# Tests with persistent warp-specialized threadblocks
|
||||
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass.DataType.s8, cutlass.DataType.s32, [2, 1, 1], [128, 128, 128], None, persistent=True)
|
||||
# Tests with warp-specialized ping-pong schedule
|
||||
add_test_tensorop(layouts=LayoutCombination.TNT, alignments=[16, 16, 16], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[2, 1, 1], threadblock_shape=[128, 128, 128], stages=None,
|
||||
kernel_schedule=cutlass.KernelScheduleType.TmaWarpSpecializedPingpong,
|
||||
epilogue_schedule=cutlass.EpilogueScheduleType.TmaWarpSpecialized)
|
||||
|
||||
# Tests for SIMT
|
||||
add_test_simt = partial(add_test, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass.DataType.s8, cutlass.DataType.s32, [1, 1, 1], [64, 32, 8], 2)
|
||||
add_test_simt = partial(add_test_specialized, opclass=cutlass.OpcodeClass.Simt)
|
||||
add_test_simt(layouts=LayoutCombination.TNN, alignments=[1, 1, 1], element_output=cutlass.DataType.s8,
|
||||
element_accumulator=cutlass.DataType.s32, cluster_shape=[1, 1, 1], threadblock_shape=[64, 32, 8], stages=2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
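Because every test method above is attached to GemmS8Sm90 dynamically via setattr, the generated names are not visible in the source. The snippet below is an editor's hedged sketch of loading and running just a subset of them with the standard unittest machinery; the "tensorop" substring filter is only an assumption about the generated names.

    # Editor's illustration, not part of the patch. The substring filter is a guess
    # at the generated method names; adjust it after inspecting t.id().
    import unittest

    loader = unittest.TestLoader()
    subset = unittest.TestSuite(
        t for t in loader.loadTestsFromTestCase(GemmS8Sm90)
        if "tensorop" in t.id().lower()
    )
    unittest.TextTestRunner(verbosity=2).run(subset)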
285
test/python/interface/conv2d_interface.py
Normal file
@ -0,0 +1,285 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Tests the high-level Conv2d interface
|
||||
"""
|
||||
|
||||
from math import ceil
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass_bindings
|
||||
import cutlass.utils.datatypes as datatypes
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from utils import ExpectException
|
||||
import os
|
||||
|
||||
|
||||
class Conv2dEquivalence:
|
||||
"""
|
||||
Helper class for testing the equivalence of different constructions of the Conv2d interface
|
||||
"""
|
||||
def __init__(self, conv_kind, element_A, element_B, element_C, element_D, element_accumulator,
|
||||
alignment_A, alignment_B, alignment_C):
|
||||
|
||||
self.element_A = element_A
|
||||
self.element_B = element_B
|
||||
self.element_C = element_C
|
||||
self.element_D = element_D
|
||||
self.element_accumulator = element_accumulator
|
||||
self.alignment_A = alignment_A
|
||||
self.alignment_B = alignment_B
|
||||
self.alignment_C = alignment_C
|
||||
|
||||
self.conv_kind = conv_kind
|
||||
|
||||
self.plan = cutlass.op.Conv2d(
|
||||
kind=self.conv_kind, element_A=element_A, element_B=element_B, element_C=element_C,
|
||||
element_D=element_D, element_accumulator=element_accumulator)
|
||||
|
||||
self.op = self.plan.construct(
|
||||
alignment_A=self.alignment_A, alignment_B=self.alignment_B,
|
||||
alignment_C=self.alignment_C)
|
||||
|
||||
def _plans_equal(self, other_plan) -> bool:
|
||||
"""
|
||||
Compares whether two plans are equal
|
||||
|
||||
:param other_plan: plan to compare against the default Conv2d
|
||||
:type other_plan: cutlass.op.Conv2d
|
||||
|
||||
:return: whether `other_plan` is equivalent to `self.plan`
|
||||
:rtype: bool
|
||||
"""
|
||||
other_op = other_plan.construct(
|
||||
alignment_A=self.alignment_A, alignment_B=self.alignment_B,
|
||||
alignment_C=self.alignment_C)
|
||||
|
||||
return self.op.rt_module.emit() == other_op.rt_module.emit()
|
||||
|
||||
def generic_test(self):
|
||||
"""
|
||||
Tests the equivalence of various constructions of the Conv2d interface when using CUTLASS data types
|
||||
and layouts for constructing the Conv2d interface
|
||||
"""
|
||||
if not datatypes.numpy_available:
|
||||
return
|
||||
|
||||
# Test when specifying all parameters
|
||||
plan_other = cutlass.op.Conv2d(
|
||||
kind=self.conv_kind,
|
||||
element_A=self.element_A, element_B=self.element_B, element_C=self.element_C,
|
||||
element_D=self.element_D, element_accumulator=self.element_accumulator)
|
||||
assert self._plans_equal(plan_other)
|
||||
|
||||
# Test when specifying all parameters but A
|
||||
plan_other = cutlass.op.Conv2d(
|
||||
kind=self.conv_kind,
|
||||
element_B=self.element_B, element_C=self.element_C,
|
||||
element_D=self.element_D, element_accumulator=self.element_accumulator,
|
||||
element=self.element_A)
|
||||
assert self._plans_equal(plan_other)
|
||||
|
||||
# Test when specifying all parameters but A and B, using the generic element
|
||||
plan_other = cutlass.op.Conv2d(
|
||||
kind=self.conv_kind,
|
||||
element_C=self.element_C,
|
||||
element_D=self.element_D, element_accumulator=self.element_accumulator,
|
||||
element=self.element_A)
|
||||
assert self._plans_equal(plan_other)
|
||||
|
||||
# Test without explicit accumulator. Only run if the type of C and the accumulator are equal
|
||||
if self.element_C == self.element_accumulator:
|
||||
plan_other = cutlass.op.Conv2d(
|
||||
kind=self.conv_kind,
|
||||
element_C=self.element_C,
|
||||
element_D=self.element_D,
|
||||
element=self.element_A)
|
||||
assert self._plans_equal(plan_other)
|
||||
|
||||
# Test with only the generic types. Only run if the types of A, B, C, and D are the same
|
||||
if (self.element_A == self.element_B and self.element_A == self.element_C and self.element_A == self.element_D
|
||||
and self.element_A == self.element_accumulator):
|
||||
plan_other = cutlass.op.Conv2d(kind=self.conv_kind, element=self.element_A)
|
||||
assert self._plans_equal(plan_other)
|
||||
|
||||
def numpy_test(self):
|
||||
"""
|
||||
Tests the equivalence of various constructions of the Conv2d interface when using numpy as a frontend
|
||||
"""
|
||||
if not datatypes.numpy_available:
|
||||
return
|
||||
|
||||
import numpy as np
|
||||
type_A = datatypes.numpy_type(self.element_A)
|
||||
type_B = datatypes.numpy_type(self.element_B)
|
||||
type_C = datatypes.numpy_type(self.element_C)
|
||||
type_D = datatypes.numpy_type(self.element_D)
|
||||
type_accum = datatypes.numpy_type(self.element_accumulator)
|
||||
|
||||
size = (2, 2)
|
||||
A = np.zeros(size, dtype=type_A)
|
||||
B = np.zeros(size, dtype=type_B)
|
||||
C = np.zeros(size, dtype=type_C)
|
||||
D = np.zeros(size, dtype=type_D)
|
||||
|
||||
return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
|
||||
|
||||
def torch_test(self):
|
||||
"""
|
||||
Tests the equivalence of various constructions of the Conv2d interface when using torch as a frontend
|
||||
"""
|
||||
if not datatypes.torch_available:
|
||||
return
|
||||
|
||||
import torch
|
||||
type_A = datatypes.torch_type(self.element_A)
|
||||
type_B = datatypes.torch_type(self.element_B)
|
||||
type_C = datatypes.torch_type(self.element_C)
|
||||
type_D = datatypes.torch_type(self.element_D)
|
||||
type_accum = datatypes.torch_type(self.element_accumulator)
|
||||
|
||||
size = (2, 2)
|
||||
|
||||
A = torch.empty(size, dtype=type_A)
|
||||
B = torch.empty(size, dtype=type_B)
|
||||
C = torch.empty(size, dtype=type_C)
|
||||
D = torch.empty(size, dtype=type_D)
|
||||
|
||||
return self.tensor_test(type_A, type_B, type_C, type_D, type_accum, A, B, C, D)
|
||||
|
||||
def tensor_test(self, type_A, type_B, type_C, type_D, type_accum, A, B, C, D):
|
||||
# Test when specifying all parameters via tensors
|
||||
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D, element_accumulator=type_accum)
|
||||
assert self._plans_equal(plan_np)
|
||||
|
||||
# Test when specifying all parameters but A as tensors
|
||||
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, B=B, C=C, D=D, element_accumulator=type_accum, element_A=type_A)
|
||||
assert self._plans_equal(plan_np)
|
||||
|
||||
# Test when specifying all parameters but A and B as tensors and using generic element and output
|
||||
if type_A == type_B:
|
||||
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, C=C, D=D, element_accumulator=type_accum, element=type_A)
|
||||
assert self._plans_equal(plan_np)
|
||||
|
||||
# Test without explicit accumulator. Only run if the type of C and the accumulator are equal.
|
||||
if type_C == type_accum:
|
||||
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, A=A, B=B, C=C, D=D)
|
||||
assert self._plans_equal(plan_np)
|
||||
|
||||
# Test with only the generic types and layouts. Only run if types and layouts of A, B, C, and D are the same.
|
||||
if (type_A == type_B and type_A == type_C and type_A == type_D and type_A == type_accum):
|
||||
plan_np = cutlass.op.Conv2d(kind=self.conv_kind, element=type_A)
|
||||
assert self._plans_equal(plan_np)
|
||||
|
||||
    def test_all(self):
        """
        Runs all tests on the Conv2d interface
        """
        self.generic_test()
        self.numpy_test()
        self.torch_test()


@unittest.skipIf(device_cc() < 80, 'Device compute capability is insufficient for SM80 tests.')
class ConvEquivalenceTest(unittest.TestCase):
    """
    Tests the equivalence of different constructions of the Conv2d interface
    """
    pass

type2alignment = {
    cutlass.DataType.f16: 8,
    cutlass.DataType.f32: 4
}
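# Both alignments above correspond to 128-bit vectorized accesses (8 x 16-bit elements
# for f16, 4 x 32-bit elements for f32), the widest access width used per data type.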

def add_test(conv_kind, element_A, element_B, element_C, element_D, element_accumulator):

    test_name = f"test_conv2d_{conv_kind}_{element_A}_{element_B}_{element_C}_{element_D}_{element_accumulator}"

    def run(self):
        conv2d_eq = Conv2dEquivalence(
            conv_kind=conv_kind,
            element_A=element_A, element_B=element_B,
            element_C=element_C, element_D=element_D,
            element_accumulator=element_accumulator,
            alignment_A=type2alignment[element_A], alignment_B=type2alignment[element_B],
            alignment_C=type2alignment[element_C]
        )
        conv2d_eq.test_all()

    setattr(ConvEquivalenceTest, test_name, run)

for conv_kind in ["fprop", "wgrad", "dgrad"]:
    for types in [
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f16],
        [cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32],
        [cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32]
    ]:
        add_test(conv_kind, types[0], types[1], types[2], types[3], types[4])
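# The loops above register 3 conv kinds x 5 dtype combinations = 15 equivalence test
# cases on ConvEquivalenceTest.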


@unittest.skipIf(device_cc() < 80, 'Device compute capability is insufficient for SM80 tests.')
class Conv2dErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Conv2d interface
    """

    def test_alignment(self):
        """
        Tests the case in which the alignment specified is unsupported
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)

        with ExpectException(True, 'Alignment 3 is not supported for F16. The construction should fail.'):
            op = plan.construct(alignment_A=3, alignment_B=3, alignment_C=3)

    def test_invalid_tile_description(self):
        """
        Tests scenarios in which an invalid tile description is provided for a given CC
        """
        plan = cutlass.op.Conv2d(kind="fprop", element=cutlass.DataType.f16)

        td = plan.tile_descriptions()[0]
        td.threadblock_shape = [17, 32, 5]

        plan.tile_description = td
        with ExpectException(True, 'The threadblock shape is invalid. The compilation should fail.'):
            plan.compile()
        # Clean up the error message
        os.remove("./cutlass_python_compilation_device_error.txt")

if __name__ == '__main__':
    unittest.main()

@ -41,6 +41,7 @@ import cutlass
import cutlass_bindings
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException


class GemmEquivalence:
@ -220,38 +221,6 @@ class GemmEquivalenceTest(unittest.TestCase):
        gemm_eq.test_all()


class ExpectException:
    """
    Utility class to assert that an exception was raised when expected

    Example:

    .. highlight:: python
    .. code-block:: python

        with ExpectException(True, 'Division by zero'):
            x = 1.0 / 0.0

    :param exception_expected: whether an exception is expected to be raised
    :type exception_expected: bool
    :param message: message to print if an exception is raised when not expected or vice versa
    :type message: str
    """
    def __init__(self, exception_expected: bool, message: str = ''):
        self.exception_expected = exception_expected
        self.message = message

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message

        # Suppress the exception
        return True


class GemmErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the high-level Gemm interface
@ -316,9 +285,22 @@ class GemmErrorTests(unittest.TestCase):
            td.stages = 0
            plan.construct(td)

        with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
            td.stages = 3
            plan.construct(td)
        if cc < 90:
            with ExpectException(cc < 80, f'Requested more than 2 stages on SM{cc}'):
                td.stages = 3
                plan.construct(td)
        else:
            original_kschedule = td.kernel_schedule
            original_eschedule = td.epilogue_schedule
            with ExpectException(False, f'Incorrectly flagged an error for insufficient shared memory'):
                td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
                td.epilogue_schedule = cutlass.EpilogueScheduleType.NoSmemWarpSpecialized
                td.stages = 3
                plan.construct(td)

            # Reset schedules
            td.kernel_schedule = original_kschedule
            td.epilogue_schedule = original_eschedule

        with ExpectException(True, f'Requested too many stages'):
            td.stages = 100
@ -335,9 +317,25 @@
        # Reset cluster shape
        td.cluster_shape = cluster_shape

        kernel_schedule = td.kernel_schedule
        with ExpectException(cc < 90, f'Requested a persistent kernel on SM{cc}'):
        with ExpectException(cc < 90, f'Requested a non-auto schedule on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(True, f'Requested a non-auto kernel schedule with an auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedPingpong
            td.epilogue_schedule = cutlass.EpilogueScheduleType.ScheduleAuto
            plan.construct(td)

        with ExpectException(True, f'Requested an auto kernel schedule with a non-auto epilogue schedule'):
            td.kernel_schedule = cutlass.KernelScheduleType.ScheduleAuto
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecialized
            plan.construct(td)

        with ExpectException(cc < 90, f'Requested a tile scheduler on SM{cc}'):
            td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative
            td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative
            td.tile_scheduler = cutlass.TileSchedulerType.StreamK
            plan.construct(td)

        # Ensure that all returned tile descriptions are unique

65
test/python/interface/utils.py
Normal file
@ -0,0 +1,65 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

"""
|
||||
Helper functions & classes for interface test
|
||||
"""
|
||||
class ExpectException:
    """
    Utility class to assert that an exception was raised when expected

    Example:

    .. highlight:: python
    .. code-block:: python

        with ExpectException(True, 'Division by zero'):
            x = 1.0 / 0.0

    :param exception_expected: whether an exception is expected to be raised
    :type exception_expected: bool
    :param message: message to print if an exception is raised when not expected or vice versa
    :type message: str
    """
    def __init__(self, exception_expected: bool, message: str = ''):
        self.exception_expected = exception_expected
        self.message = message

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        exception_raised = exc_type is not None
        assert self.exception_expected == exception_raised, self.message

        # Suppress the exception
        return True
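# Illustrative usage sketch for the opposite case: with exception_expected=False,
# __exit__ asserts that the body completed without raising.
#
#   with ExpectException(False, 'Simple addition should not raise'):
#       x = 1.0 + 1.0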