CUTLASS 3.2.1 (#1113)

* Updates for 3.2.1 release.

* Minor fix in gemm op profiler for raster order.

* Add scheduler mapping for raster order in the kernels.
This commit is contained in:
ANIKET SHIVAM
2023-09-26 14:24:26 -07:00
committed by GitHub
parent e0aaa3c3b3
commit 90d3b0fb18
428 changed files with 22253 additions and 21762 deletions

View File

@ -1,233 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
from cutlass.backend.conv2d_operation import *
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Dgrad conv2d tests with float16 NHWC tensors and float16 accumulation (arch=80).

    Every test builds the same TensorOp operation and varies only the iterator
    algorithm (analytic / optimized) and the tensor alignment (8 / 4).
    """

    @staticmethod
    def _make_operation(iterator_algorithm, alignment):
        """Build the dgrad ``Conv2dOperation`` shared by all tests of this class.

        ``alignment`` is applied to the A, B and C tensor descriptions alike.
        """
        half = cutlass_bindings.float16
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half, element_b=half,
            element_accumulator=half, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(element=math_inst.element_a, layout=nhwc, alignment=alignment)
        B = TensorDescription(element=math_inst.element_b, layout=nhwc, alignment=alignment)
        C = TensorDescription(element=half, layout=nhwc, alignment=alignment)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, half)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    @staticmethod
    def _align4_problem_sizes():
        """Return the single explicit problem size used by the ``align4`` tests."""
        # NOTE(review): the Tensor4DCoord/MatrixCoord arguments are presumably
        # input, filter, padding, stride and dilation -- confirm against
        # cutlass_bindings.conv.Conv2dProblemSize.
        return [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, alignment=8)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, alignment=8)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, alignment=4)
        problem_sizes = self._align4_problem_sizes()
        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, alignment=4)
        problem_sizes = self._align4_problem_sizes()
        self.assertTrue(test_all_conv2d(operation, problem_sizes))
if __name__ == '__main__':
    # The imports at the top of this file only star-import from cutlass.backend
    # and never bind the name `cutlass` itself, so import the package explicitly
    # before using the dotted path below.
    import cutlass.backend

    # Set up the backend memory pool before any test runs; 2**26 is passed for
    # both arguments.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()

View File

@ -1,209 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad conv2d tests with float16 A/B, float32 C and float32 accumulation (arch=80).

    All tests use the optimized iterator algorithm with unity stride support;
    they differ only in the threadblock K extent (32 / 64) and the stage count
    (3 / 4).
    """

    @staticmethod
    def _make_operation(threadblock_k, stages):
        """Build the dgrad ``Conv2dOperation`` for a given threadblock K and stage count."""
        half = cutlass_bindings.float16
        single = cutlass_bindings.float32
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=half, element_b=half,
            element_accumulator=single, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(element=math_inst.element_a, layout=nhwc, alignment=8)
        B = TensorDescription(element=math_inst.element_b, layout=nhwc, alignment=8)
        C = TensorDescription(element=single, layout=nhwc, alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, threadblock_k], stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, single)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        operation = self._make_operation(threadblock_k=32, stages=3)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        operation = self._make_operation(threadblock_k=32, stages=4)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        operation = self._make_operation(threadblock_k=64, stages=3)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        operation = self._make_operation(threadblock_k=64, stages=4)
        self.assertTrue(test_all_conv2d(operation))
if __name__ == "__main__":
    # Set up the backend memory pool before running the tests; the same value
    # (2**26) is passed for both arguments.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,130 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import cutlass.backend
from cutlass.backend.conv2d_operation import *
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Dgrad conv2d tests with float32 NHWC tensors on the Simt opcode class (arch=80).

    The two tests differ in the iterator algorithm and in the warp count.
    """

    @staticmethod
    def _make_operation(iterator_algorithm, warp_count):
        """Build the dgrad ``Conv2dOperation`` for the given iterator algorithm and warp count."""
        single = cutlass_bindings.float32
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=single, element_b=single,
            element_accumulator=single, opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(element=math_inst.element_a, layout=nhwc, alignment=4)
        B = TensorDescription(element=math_inst.element_b, layout=nhwc, alignment=4)
        C = TensorDescription(element=single, layout=nhwc, alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=warp_count,
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, single)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    # NOTE(review): this test is named "Fprop" but the operation it builds uses
    # conv_kind=dgrad, like the rest of the class. The name is kept unchanged
    # because it is the test's ID.
    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, warp_count=[4, 2, 1])
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        operation = self._make_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, warp_count=[2, 4, 1])
        self.assertTrue(test_all_conv2d(operation))
if __name__ == "__main__":
    # Set up the backend memory pool before running the tests; the same value
    # (2**26) is passed for both arguments.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,127 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad conv2d tests using a [16, 8, 8] TensorOp instruction on float32 NHWC tensors (arch=80).

    The test names refer to tf32; the element types handed to the descriptions
    are ``cutlass_bindings.float32``. The two tests differ only in the iterator
    algorithm.
    """

    @staticmethod
    def _make_operation(iterator_algorithm):
        """Build the dgrad ``Conv2dOperation`` for the given iterator algorithm."""
        single = cutlass_bindings.float32
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=single, element_b=single,
            element_accumulator=single, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(element=math_inst.element_a, layout=nhwc, alignment=4)
        B = TensorDescription(element=math_inst.element_b, layout=nhwc, alignment=4)
        # NOTE(review): alignment=8 on a float32 tensor is larger than the
        # alignment=4 used for A and B here -- confirm this is intended.
        C = TensorDescription(element=single, layout=nhwc, alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, single)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad, iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        operation = self._make_operation(cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        operation = self._make_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(operation))
if __name__ == "__main__":
    # Set up the backend memory pool before running the tests; the same value
    # (2**26) is passed for both arguments.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,196 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
def conv2d_few_channel_problemsizes(channels):
    """Return the list of ``Conv2dProblemSize`` objects used by the few-channels tests.

    ``channels`` is used as the last extent of both ``Tensor4DCoord`` shapes of
    every problem. Seven problems are returned, always in the same order.

    This helper is intentionally not decorated with ``unittest.skipIf``: the
    skip condition belongs on the TestCase class below, so that unsupported
    devices skip the tests up front instead of only when this function is
    called.
    """
    # One row per problem: (spatial extent, first extent of the second coord,
    # square extent of the second coord, value used for both MatrixCoord(3rd arg)).
    # NOTE(review): these are presumably input H/W, filter count K, filter R/S
    # and stride -- confirm against cutlass_bindings.conv.Conv2dProblemSize.
    shapes = [
        (8, 16, 3, 2),
        (16, 16, 3, 2),
        (16, 16, 7, 1),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    ]
    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, hw, hw, channels),
            cutlass_bindings.Tensor4DCoord(k, rs, rs, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride, stride),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for hw, k, rs, stride in shapes
    ]


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """Fprop conv2d tests for the ``few_channels`` iterator algorithm (arch=80).

    float16 NHWC tensors with float32 accumulation, strided stride support.
    The two tests differ in channel count / A-B alignment, instruction K,
    threadblock K and stage count.
    """

    @staticmethod
    def _make_operation(alignment, instruction_k, threadblock_k, stages):
        """Build the fprop few-channels ``Conv2dOperation``.

        ``alignment`` is applied to the A and B descriptions only; C always
        uses alignment 8.
        """
        half = cutlass_bindings.float16
        single = cutlass_bindings.float32
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[16, 8, instruction_k],
            element_a=half, element_b=half,
            element_accumulator=single, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(element=math_inst.element_a, layout=nhwc, alignment=alignment)
        B = TensorDescription(element=math_inst.element_b, layout=nhwc, alignment=alignment)
        C = TensorDescription(element=half, layout=nhwc, alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, threadblock_k], stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, single)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        operation = self._make_operation(
            alignment=2, instruction_k=16, threadblock_k=64, stages=3)
        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
        operation = self._make_operation(
            alignment=1, instruction_k=8, threadblock_k=32, stages=2)
        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))
if __name__ == '__main__':
    # Set up the cutlass.backend memory pool before handing control to
    # unittest. Both arguments are 2**26 (presumably the initial and maximum
    # pool sizes -- confirm against cutlass.backend.get_memory_pool).
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,220 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
def conv2d_fixed_channel_problemsizes(channels):
    """Build the conv2d problem sizes used by the fixed-channels fprop tests.

    Args:
        channels: Value used as the last extent of both ``Tensor4DCoord``
            shapes of every problem (presumably the channel count of the
            activation and filter tensors -- confirm against the bindings).

    Returns:
        A list of five ``cutlass_bindings.conv.Conv2dProblemSize`` objects,
        all using cross-correlation mode.
    """
    # Each row holds the leading three extents of the first and second
    # Tensor4DCoord and the extents of the first MatrixCoord. Presumably these
    # are the input shape, the filter shape and the stride -- TODO confirm
    # against the Conv2dProblemSize binding.
    cases = [
        ((1, 8, 8), (16, 3, 3), (2, 2)),
        ((1, 224, 224), (32, 7, 7), (1, 1)),
        ((1, 224, 224), (64, 7, 7), (2, 2)),
        ((1, 224, 224), (64, 5, 5), (1, 1)),
        ((1, 224, 224), (64, 5, 5), (2, 2)),
    ]
    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(*first, channels),
            cutlass_bindings.Tensor4DCoord(*second, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(*step),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1,
        )
        for first, second, step in cases
    ]


# The skip guard belongs on the TestCase, not on the problem-size helper:
# unittest.skipIf on a plain function only raises SkipTest when that function
# is called, which here would be after each test had already built its
# Conv2dOperation. Guarding the class skips the tests up front.
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """Forward-propagation conv2d tests using the ``fixed_channels`` iterator algorithm.

    Operands and output are float16 in the NHWC layout, accumulation is
    float32 on tensor-op math instructions, and every operation is built
    for ``arch=80``.
    """

    def _check_fixed_channels(self, channels):
        # Shared body of all tests: build the fixed-channels operation whose
        # A/B alignment equals ``channels`` and run it through test_all_conv2d.
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels,
        )
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels,
        )
        C = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
        )
        epilogue_functor = LinearCombination(
            C.element,
            C.alignment,
            math_inst.element_accumulator,
            cutlass_bindings.float32,
        )
        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.fixed_channels,
            arch=80,
            tile_description=tile_description,
            A=A,
            B=B,
            C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(channels)))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
        # Eight channels, A/B alignment 8.
        self._check_fixed_channels(8)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
        # Four channels, A/B alignment 4.
        self._check_fixed_channels(4)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        # Two channels, A/B alignment 2.
        self._check_fixed_channels(2)
if __name__ == '__main__':
    # Set up the cutlass.backend memory pool before handing control to
    # unittest. Both arguments are 2**26 (presumably the initial and maximum
    # pool sizes -- confirm against cutlass.backend.get_memory_pool).
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,341 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Forward-propagation implicit-GEMM conv2d tests with float16 accumulation.

    Operands and output are float16 in the NHWC layout on tensor-op math
    instructions, built for ``arch=80``. The tests vary the iterator
    algorithm and the A/B alignment; the reduced-alignment tests supply
    their own problem sizes.
    """

    @staticmethod
    def _build_operation(iterator_algorithm, ab_alignment):
        # Build the fprop Conv2dOperation shared by every test; only the
        # iterator algorithm and the A/B alignment differ between tests.
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float16,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        def nhwc(element, alignment):
            return TensorDescription(
                element=element,
                layout=cutlass_bindings.TensorNHWC,
                alignment=alignment,
            )

        tensor_a = nhwc(mma.element_a, ab_alignment)
        tensor_b = nhwc(mma.element_b, ab_alignment)
        tensor_c = nhwc(cutlass_bindings.float16, 8)

        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            mma.element_accumulator,
            cutlass_bindings.float16,
        )
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    @staticmethod
    def _reduced_alignment_problem_sizes(first_c, second_c, third_c):
        # Build the three problem sizes used by the align2/align4 tests; the
        # arguments are the last extent of the two Tensor4DCoord shapes of
        # each problem, in order.
        rows = [
            ((1, 4, 4), (8, 3, 3), (0, 0, 0, 0), first_c),
            ((1, 4, 4), (8, 3, 3), (0, 0, 0, 0), second_c),
            ((1, 23, 56), (128, 3, 3), (4, 0, 5, 0), third_c),
        ]
        return [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(*first, last_extent),
                cutlass_bindings.Tensor4DCoord(*second, last_extent),
                cutlass_bindings.Tensor4DCoord(*third),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            )
            for first, second, third, last_extent in rows
        ]

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        # Analytic iterator, A/B alignment 8, default problem sizes.
        conv_op = self._build_operation(cutlass_bindings.conv.IteratorAlgorithm.analytic, 8)
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        # Optimized iterator, A/B alignment 8, default problem sizes.
        conv_op = self._build_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized, 8)
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        # Analytic iterator, A/B alignment 2, explicit problem sizes.
        conv_op = self._build_operation(cutlass_bindings.conv.IteratorAlgorithm.analytic, 2)
        sizes = self._reduced_alignment_problem_sizes(12, 14, 98)
        self.assertTrue(test_all_conv2d(conv_op, sizes))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        # Optimized iterator, A/B alignment 2, explicit problem sizes.
        conv_op = self._build_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized, 2)
        sizes = self._reduced_alignment_problem_sizes(12, 14, 98)
        self.assertTrue(test_all_conv2d(conv_op, sizes))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        # A/B alignment 4, explicit problem sizes.
        # NOTE(review): the method name says "Analytic" but the operation is
        # built with IteratorAlgorithm.optimized -- confirm which is intended.
        conv_op = self._build_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized, 4)
        sizes = self._reduced_alignment_problem_sizes(12, 28, 100)
        self.assertTrue(test_all_conv2d(conv_op, sizes))
if __name__ == '__main__':
    # Set up the cutlass.backend memory pool before handing control to
    # unittest. Both arguments are 2**26 (presumably the initial and maximum
    # pool sizes -- confirm against cutlass.backend.get_memory_pool).
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,86 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Forward-propagation implicit-GEMM conv2d test with float32 output.

    Operands are float16 and the output is float32, all in the NHWC layout,
    with float32 accumulation on tensor-op math instructions for ``arch=80``.
    """

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        # Analytic iterator, A/B alignment 8, C alignment 4, default problem sizes.
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        def nhwc(element, alignment):
            return TensorDescription(
                element=element,
                layout=cutlass_bindings.TensorNHWC,
                alignment=alignment,
            )

        tensor_a = nhwc(mma.element_a, 8)
        tensor_b = nhwc(mma.element_b, 8)
        tensor_c = nhwc(cutlass_bindings.float32, 4)

        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            mma.element_accumulator,
            cutlass_bindings.float32,
        )
        conv_op = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        passed = test_all_conv2d(conv_op)
        self.assertTrue(passed)
if __name__ == '__main__':
    # Set up the cutlass.backend memory pool before handing control to
    # unittest. Both arguments are 2**26 (presumably the initial and maximum
    # pool sizes -- confirm against cutlass.backend.get_memory_pool).
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,128 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import cutlass.backend
from cutlass.backend.conv2d_operation import *
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Forward-propagation implicit-GEMM conv2d tests on SIMT math instructions.

    Operands, output and accumulation are all float32 in the NHWC layout,
    built for ``arch=80``. The two tests differ in iterator algorithm, warp
    count and swizzling functor.
    """

    def _check_simt(self, iterator_algorithm, warp_count, swizzling_functor):
        # Shared body of both tests: build the SIMT fprop operation and run it
        # through test_all_conv2d with the default problem sizes.
        mma = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass_bindings.float32,
            element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )

        def nhwc(element, alignment):
            return TensorDescription(
                element=element,
                layout=cutlass_bindings.TensorNHWC,
                alignment=alignment,
            )

        tensor_a = nhwc(mma.element_a, 4)
        tensor_b = nhwc(mma.element_b, 4)
        tensor_c = nhwc(cutlass_bindings.float32, 1)

        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=warp_count,
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            mma.element_accumulator,
            cutlass_bindings.float32,
        )
        conv_op = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=swizzling_functor,
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        # Analytic iterator with a 4x2x1 warp count and IdentitySwizzle2.
        self._check_simt(
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
            warp_count=[4, 2, 1],
            swizzling_functor=cutlass_bindings.IdentitySwizzle2,
        )

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        # Optimized iterator with a 2x4x1 warp count and IdentitySwizzle1.
        self._check_simt(
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
            warp_count=[2, 4, 1],
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
if __name__ == '__main__':
    # Set up the cutlass.backend memory pool before handing control to
    # unittest. Both arguments are 2**26 (presumably the initial and maximum
    # pool sizes -- confirm against cutlass.backend.get_memory_pool).
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,139 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Fprop conv2d tests with float32 NHWC tensors and TensorOp math, targeting arch 80.

    The whole class is skipped when ``device_cc()`` reports a value below 80.
    """

    @staticmethod
    def _build_fprop_operation(iterator_algorithm, operand_alignment):
        """Create the fprop ``Conv2dOperation`` shared by the tests in this class.

        Args:
            iterator_algorithm: member of ``cutlass_bindings.conv.IteratorAlgorithm``.
            operand_alignment: alignment used for both the A and B tensor descriptions.

        Returns:
            The configured ``Conv2dOperation``.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass_bindings.float32,
            element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=mma.element_a, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        operand_b = TensorDescription(
            element=mma.element_b, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        output_c = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.TensorNHWC, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 16],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )
        # The epilogue takes its output element/alignment from C and its
        # accumulator type from the math instruction.
        epilogue = LinearCombination(
            output_c.element, output_c.alignment, mma.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=output_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic iterator, A/B alignment 4; no explicit problem sizes are passed."""
        conv_op = self._build_fprop_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, operand_alignment=4)
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
        """Optimized iterator, A/B alignment 2, with one additional problem size."""
        conv_op = self._build_fprop_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, operand_alignment=2)
        extra_problems = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self.assertTrue(test_all_conv2d(conv_op, extra_problems))
if __name__ == "__main__":
    # Configure the backend memory pool (both arguments are 2**26) before
    # handing control to the unittest runner.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,285 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
    """Strided dgrad conv2d tests: float16 A/B, float32 accumulator and C, NHWC, TensorOp, arch 80.

    The whole class is skipped when ``device_cc()`` reports a value below 80.
    """

    @staticmethod
    def _build_dgrad_operation(iterator_algorithm, operand_alignment, threadblock_shape, warp_count):
        """Create the dgrad ``Conv2dOperation`` shared by the tests in this class.

        Args:
            iterator_algorithm: member of ``cutlass_bindings.conv.IteratorAlgorithm``.
            operand_alignment: alignment used for both the A and B tensor descriptions.
            threadblock_shape: three-element list forwarded to ``TileDescription``.
            warp_count: three-element list forwarded to ``TileDescription``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=mma.element_a, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        operand_b = TensorDescription(
            element=mma.element_b, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        output_c = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.TensorNHWC, alignment=4)
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=warp_count,
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            output_c.element, output_c.alignment, mma.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=output_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        """Analytic iterator, A/B alignment 8, 128x128x32 threadblock."""
        conv_op = self._build_dgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            operand_alignment=8,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
        """Analytic iterator, A/B alignment 8, 128x256x64 threadblock."""
        conv_op = self._build_dgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            operand_alignment=8,
            threadblock_shape=[128, 256, 64],
            warp_count=[2, 4, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
        """Analytic iterator, A/B alignment 4, with one additional problem size."""
        conv_op = self._build_dgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            operand_alignment=4,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        extra_problems = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self.assertTrue(test_all_conv2d(conv_op, extra_problems))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        """Optimized iterator, A/B alignment 8, 128x128x32 threadblock."""
        conv_op = self._build_dgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            operand_alignment=8,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
        """Optimized iterator, A/B alignment 4, with two additional problem sizes."""
        conv_op = self._build_dgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            operand_alignment=4,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        # The two problems differ only in the first coordinate: 56x56 vs 55x55.
        extra_problems = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 56, 56, 12),
                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(2, 2),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 55, 55, 12),
                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(2, 2),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self.assertTrue(test_all_conv2d(conv_op, extra_problems))
if __name__ == "__main__":
    # Configure the backend memory pool (both arguments are 2**26) before
    # handing control to the unittest runner.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,129 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Wgrad conv2d tests using float16 everywhere (A, B, C, accumulator), NHWC, TensorOp, arch 80.

    The whole class is skipped when ``device_cc()`` reports a value below 80.
    """

    @staticmethod
    def _build_wgrad_operation(iterator_algorithm):
        """Create the wgrad ``Conv2dOperation``; only the iterator algorithm varies between tests.

        Args:
            iterator_algorithm: member of ``cutlass_bindings.conv.IteratorAlgorithm``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        mma = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float16,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=mma.element_a, layout=cutlass_bindings.TensorNHWC, alignment=8)
        operand_b = TensorDescription(
            element=mma.element_b, layout=cutlass_bindings.TensorNHWC, alignment=8)
        output_c = TensorDescription(
            element=cutlass_bindings.float16, layout=cutlass_bindings.TensorNHWC, alignment=8)
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            output_c.element, output_c.alignment, mma.element_accumulator, cutlass_bindings.float16)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=output_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Run the conv2d test sweep with the analytic iterator algorithm."""
        conv_op = self._build_wgrad_operation(cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(conv_op))

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        """Run the conv2d test sweep with the optimized iterator algorithm."""
        conv_op = self._build_wgrad_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(conv_op))
if __name__ == "__main__":
    # Configure the backend memory pool (both arguments are 2**26) before
    # handing control to the unittest runner.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,274 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: float16 A/B, float32 accumulator and C, NHWC, TensorOp, arch 80.

    The whole class is skipped when ``device_cc()`` reports a value below 80.
    """

    @staticmethod
    def _build_wgrad_operation(
            iterator_algorithm, instruction_shape, operand_alignment, threadblock_shape, warp_count):
        """Create the wgrad ``Conv2dOperation`` shared by the tests in this class.

        Args:
            iterator_algorithm: member of ``cutlass_bindings.conv.IteratorAlgorithm``.
            instruction_shape: three-element list forwarded to ``MathInstruction``.
            operand_alignment: alignment used for both the A and B tensor descriptions.
            threadblock_shape: three-element list forwarded to ``TileDescription``.
            warp_count: three-element list forwarded to ``TileDescription``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        mma = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=mma.element_a, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        operand_b = TensorDescription(
            element=mma.element_b, layout=cutlass_bindings.TensorNHWC, alignment=operand_alignment)
        output_c = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.TensorNHWC, alignment=4)
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=warp_count,
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            output_c.element, output_c.alignment, mma.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=output_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Analytic iterator, 16x8x8 instruction, A/B alignment 8."""
        conv_op = self._build_wgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            instruction_shape=[16, 8, 8],
            operand_alignment=8,
            threadblock_shape=[128, 128, 16],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        """Optimized iterator, 16x8x8 instruction, A/B alignment 8."""
        conv_op = self._build_wgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 8],
            operand_alignment=8,
            threadblock_shape=[128, 128, 16],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
        """Optimized iterator, 16x8x16 instruction, 64x256x32 threadblock with 1x4x1 warps."""
        conv_op = self._build_wgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 16],
            operand_alignment=8,
            threadblock_shape=[64, 256, 32],
            warp_count=[1, 4, 1],
        )
        self.assertTrue(test_all_conv2d(conv_op))

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        """Analytic iterator, A/B alignment 4, with one additional problem size."""
        conv_op = self._build_wgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            instruction_shape=[16, 8, 8],
            operand_alignment=4,
            threadblock_shape=[128, 128, 16],
            warp_count=[2, 2, 1],
        )
        extra_problems = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self.assertTrue(test_all_conv2d(conv_op, extra_problems))

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        """Optimized iterator, A/B alignment 4, with one additional problem size."""
        conv_op = self._build_wgrad_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 8],
            operand_alignment=4,
            threadblock_shape=[128, 128, 16],
            warp_count=[2, 2, 1],
        )
        extra_problems = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]
        self.assertTrue(test_all_conv2d(conv_op, extra_problems))
if __name__ == "__main__":
    # Configure the backend memory pool (both arguments are 2**26) before
    # handing control to the unittest runner.
    pool_size = 2**26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,128 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import cutlass.backend
from cutlass.backend.conv2d_operation import *
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Wgrad conv2d tests using float32 NHWC tensors with the Simt opcode class, arch 80.

    The whole class is skipped when ``device_cc()`` reports a value below 80.
    """

    @staticmethod
    def _build_wgrad_operation(iterator_algorithm):
        """Create the wgrad ``Conv2dOperation``; only the iterator algorithm varies between tests.

        Args:
            iterator_algorithm: member of ``cutlass_bindings.conv.IteratorAlgorithm``.

        Returns:
            The configured ``Conv2dOperation``.
        """
        mma = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass_bindings.float32,
            element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )
        operand_a = TensorDescription(
            element=mma.element_a, layout=cutlass_bindings.TensorNHWC, alignment=4)
        operand_b = TensorDescription(
            element=mma.element_b, layout=cutlass_bindings.TensorNHWC, alignment=4)
        output_c = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.TensorNHWC, alignment=1)
        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=[2, 4, 1],
            math_instruction=mma,
        )
        epilogue = LinearCombination(
            output_c.element, output_c.alignment, mma.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=output_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Run the conv2d test sweep with the analytic iterator algorithm."""
        conv_op = self._build_wgrad_operation(cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(conv_op))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        """Run the conv2d test sweep with the optimized iterator algorithm."""
        conv_op = self._build_wgrad_operation(cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(conv_op))
if __name__ == '__main__':
    # Both arguments to get_memory_pool are 2**26 (presumably sizes in bytes - TODO confirm).
    pool_size = 1 << 26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,139 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
from cutlass.backend.utils.device import device_cc
import unittest
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """
    Conv2d wgrad tests on the TensorOp opcode class with NHWC tensors (arch=80).

    All operands are declared as `cutlass_bindings.float32` with instruction shape
    [16, 8, 8]; both tests use the optimized iterator algorithm and differ in operand
    alignment, output alignment and threadblock shape.
    """

    def _make_wgrad_operation(self, *, operand_alignment, output_alignment, threadblock_shape):
        """
        Describe a TensorOp wgrad kernel and return the resulting `Conv2dOperation`.

        :param operand_alignment: alignment used for both the A and B operands
        :type operand_alignment: int
        :param output_alignment: alignment used for the C operand
        :type output_alignment: int
        :param threadblock_shape: indexable container of dimensions of threadblock tiles

        :return: the constructed operation
        """
        float32 = cutlass_bindings.float32
        nhwc = cutlass_bindings.TensorNHWC

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=float32,
            element_b=float32,
            element_accumulator=float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tensor_a = TensorDescription(
            element=math_inst.element_a, layout=nhwc, alignment=operand_alignment)
        tensor_b = TensorDescription(
            element=math_inst.element_b, layout=nhwc, alignment=operand_alignment)
        tensor_c = TensorDescription(
            element=float32, layout=nhwc, alignment=output_alignment)

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
        )

        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            math_inst.element_accumulator,
            float32,
        )

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        # Alignment-4 operands; test_all_conv2d is called without explicit problem sizes.
        operation = self._make_wgrad_operation(
            operand_alignment=4,
            output_alignment=8,
            threadblock_shape=[128, 128, 16],
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
        # Alignment-1 operands, exercised against one explicitly supplied problem size.
        operation = self._make_wgrad_operation(
            operand_alignment=1,
            output_alignment=4,
            threadblock_shape=[128, 128, 32],
        )

        # NOTE(review): argument meaning is not visible here; presumably input size,
        # filter size, padding, stride, dilation, mode, split-k slices, groups - confirm
        # against the Conv2dProblemSize binding.
        single_problem = cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, 8, 8, 1),
            cutlass_bindings.Tensor4DCoord(1, 3, 3, 1),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )

        self.assertTrue(test_all_conv2d(operation, [single_problem]))
if __name__ == '__main__':
    # Both arguments to get_memory_pool are 2**26 (presumably sizes in bytes - TODO confirm).
    pool_size = 1 << 26
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,128 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmBF16TensorOpSm80(unittest.TestCase):
    """
    GEMM tests with bfloat16 A/B operands on the TensorOp opcode class (arch=80).

    Every case uses instruction shape [16, 8, 16], float32 accumulation, a float32
    epilogue element, alignment-8 A/B operands, a row-major C and IdentitySwizzle1.
    """

    def _check_gemm(self, *, threadblock_shape, stages, input_layout,
                    element_c, alignment_c, mode):
        """
        Build a `GemmOperationUniversal` for bfloat16 inputs and assert that
        `test_all_gemm` accepts it.

        :param threadblock_shape: indexable container of dimensions of threadblock tiles
        :param stages: number of pipeline stages to use in the kernel
        :type stages: int
        :param input_layout: layout shared by the A and B operands
        :param element_c: data type of the C operand
        :param alignment_c: alignment of the C operand
        :type alignment_c: int
        :param mode: second argument forwarded to `test_all_gemm`
        :type mode: str
        """
        bfloat16 = cutlass_bindings.bfloat16

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=bfloat16,
            element_b=bfloat16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
        )

        tensor_a = TensorDescription(element=bfloat16, layout=input_layout, alignment=8)
        tensor_b = TensorDescription(element=bfloat16, layout=input_layout, alignment=8)
        tensor_c = TensorDescription(
            element=element_c, layout=cutlass_bindings.RowMajor, alignment=alignment_c)

        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            math_inst.element_accumulator,
            cutlass_bindings.float32,
        )

        operation = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

        self.assertTrue(test_all_gemm(operation, mode))

    # NOTE(review): this method has no `test` prefix, so the default unittest loader
    # does not collect it. Left as-is because it may have been disabled on purpose;
    # confirm whether it should be renamed to `test_SM80_...` or removed.
    def SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
        self._check_gemm(
            threadblock_shape=[64, 128, 64],
            stages=4,
            input_layout=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32,
            alignment_c=4,
            mode="universal",
        )

    # NOTE(review): the name advertises a 128x256x64 tile, but the threadblock shape
    # used is [64, 128, 32] - confirm which one is intended.
    def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_gemm(
            threadblock_shape=[64, 128, 32],
            stages=6,
            input_layout=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.bfloat16,
            alignment_c=8,
            mode="multistage",
        )
if __name__ == '__main__':
    # Both arguments to get_memory_pool are 2**30 (presumably sizes in bytes - TODO confirm).
    pool_size = 1 << 30
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,138 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from functools import partial
import cutlass.backend
from cutlass.backend import *
from cutlass.backend import library
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
# `get_name` with the A/B element types (bfloat16) and the arch (90) pre-bound; the
# remaining arguments are supplied per test by `add_test`.
name_fn = partial(
    get_name,
    element_a=cutlass_bindings.bfloat16,
    element_b=cutlass_bindings.bfloat16,
    arch=90,
)
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Generate a GEMM test for the given configuration and attach it to `cls`.

    The generated method is stored on `cls` under the name produced by `name_fn` and is
    also returned to the caller.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool

    :return: the generated test method
    """

    def run(self):
        """
        Dynamically-generated test: builds the GEMM operation described by the enclosing
        `add_test` call and checks it with `test_all_gemm`.
        """
        bfloat16 = cutlass_bindings.bfloat16

        # Only the SIMT path pins the instruction shape and warp count; every other
        # opcode class passes None for both.
        if opclass == cutlass_bindings.OpClass.Simt:
            inst_shape, warp_count = [1, 1, 1], [2, 2, 1]
        else:
            inst_shape, warp_count = None, None

        math_inst = MathInstruction(
            instruction_shape=inst_shape,
            element_a=bfloat16,
            element_b=bfloat16,
            element_accumulator=element_accumulator,
            opcode_class=opclass,
            math_operation=MathOperation.multiply_add,
        )

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages,
            warp_count=warp_count,
            math_instruction=math_inst,
            persistent=persistent,
        )

        layout_a, layout_b, layout_c = layouts[0], layouts[1], layouts[2]
        align_a, align_b, align_c = alignments[0], alignments[1], alignments[2]
        tensor_a = TensorDescription(element=bfloat16, layout=layout_a, alignment=align_a)
        tensor_b = TensorDescription(element=bfloat16, layout=layout_b, alignment=align_b)
        tensor_c = TensorDescription(element=element_output, layout=layout_c, alignment=align_c)

        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            math_inst.element_accumulator,
            element_epilogue,
        )

        operation = GemmOperationUniversal(
            arch=90,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

        self.assertTrue(test_all_gemm(operation, "universal"))

    # Persistent kernels get a distinguishing suffix in the generated test name.
    suffix = "_persistent" if persistent else ""
    test_name = name_fn(
        layouts, alignments, element_output, element_accumulator,
        element_epilogue, cluster_shape, threadblock_shape, stages,
        opclass=opclass, suffix=suffix,
    )

    setattr(cls, test_name, run)
    return run
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmBF16Sm90(unittest.TestCase):
    """
    Empty test case that acts as the attachment point for generated tests.

    It declares no test methods itself; they are installed on it with `setattr` by the
    module-level `add_test` calls.
    """
# `add_test` specialised per opcode class.
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)

# Tests are registered on GemmBF16Sm90 when the module is imported, so they are in
# place before unittest.main() collects them.
add_test_tensorop(
    GemmBF16Sm90,
    layouts=LayoutCombination.NNN,
    alignments=[8, 8, 8],
    element_output=cutlass_bindings.bfloat16,
    element_accumulator=cutlass_bindings.float32,
    element_epilogue=cutlass_bindings.float32,
    cluster_shape=[1, 1, 1],
    threadblock_shape=[128, 128, 32],
    stages=3,
)
add_test_tensorop(
    GemmBF16Sm90,
    layouts=LayoutCombination.NNN,
    alignments=[4, 4, 8],
    element_output=cutlass_bindings.bfloat16,
    element_accumulator=cutlass_bindings.float32,
    element_epilogue=cutlass_bindings.float32,
    cluster_shape=[1, 1, 1],
    threadblock_shape=[128, 128, 32],
    stages=5,
)
add_test_tensorop(
    GemmBF16Sm90,
    layouts=LayoutCombination.TNN,
    alignments=[8, 8, 8],
    element_output=cutlass_bindings.bfloat16,
    element_accumulator=cutlass_bindings.float32,
    element_epilogue=cutlass_bindings.float32,
    cluster_shape=[2, 1, 1],
    threadblock_shape=[128, 128, 32],
    stages=None,
)
add_test_tensorop(
    GemmBF16Sm90,
    layouts=LayoutCombination.TNN,
    alignments=[8, 8, 8],
    element_output=cutlass_bindings.bfloat16,
    element_accumulator=cutlass_bindings.float32,
    element_epilogue=cutlass_bindings.float32,
    cluster_shape=[2, 1, 1],
    threadblock_shape=[128, 128, 32],
    stages=None,
    persistent=True,
)
add_test_simt(
    GemmBF16Sm90,
    layouts=LayoutCombination.NNN,
    alignments=[1, 1, 1],
    element_output=cutlass_bindings.bfloat16,
    element_accumulator=cutlass_bindings.float32,
    element_epilogue=cutlass_bindings.float32,
    cluster_shape=[1, 1, 1],
    threadblock_shape=[128, 128, 8],
    stages=2,
)
if __name__ == '__main__':
    # Both arguments to get_memory_pool are 2**30 (presumably sizes in bytes - TODO confirm).
    pool_size = 1 << 30
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,479 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF16Sm80(unittest.TestCase):
    """
    GEMM tests with float16 A/B operands on the TensorOp opcode class (arch=80).

    Every case uses instruction shape [16, 8, 16], alignment-8 A/B operands and runs
    `test_all_gemm` in "universal" mode. The per-test differences (tile shape, warp
    count, stages, layouts, output type/alignment, accumulator and epilogue types,
    swizzle) are passed to `_check_universal_gemm`.

    NOTE(review): several test names do not appear to match the configuration they run
    (see the notes on the individual tests). The configurations are kept exactly as
    they were; only the names are suspect.
    """

    def _check_universal_gemm(self, *, threadblock_shape, warp_count, stages,
                              layout_a, layout_b, layout_c,
                              element_c, alignment_c,
                              element_accumulator, element_epilogue,
                              swizzling_functor=None, **operation_kwargs):
        """
        Build a float16 `GemmOperationUniversal` and assert that `test_all_gemm` accepts it.

        :param threadblock_shape: indexable container of dimensions of threadblock tiles
        :param warp_count: indexable container with the number of warps per dimension
        :param stages: number of pipeline stages to use in the kernel
        :type stages: int
        :param layout_a: layout of the A operand
        :param layout_b: layout of the B operand
        :param layout_c: layout of the C operand
        :param element_c: data type of the C operand
        :param alignment_c: alignment of the C operand
        :type alignment_c: int
        :param element_accumulator: data type used in accumulation
        :param element_epilogue: data type used in computing the epilogue
        :param swizzling_functor: threadblock swizzle; `cutlass_bindings.IdentitySwizzle1`
            is used when this is None
        :param operation_kwargs: extra keyword arguments forwarded unchanged to
            `GemmOperationUniversal` (e.g. ``direct_store=True``)
        """
        float16 = cutlass_bindings.float16

        # Resolved lazily so that nothing from cutlass_bindings is touched at class
        # definition time, matching the original per-test lookups.
        if swizzling_functor is None:
            swizzling_functor = cutlass_bindings.IdentitySwizzle1

        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=float16,
            element_b=float16,
            element_accumulator=element_accumulator,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=warp_count,
            math_instruction=math_inst,
        )

        tensor_a = TensorDescription(element=float16, layout=layout_a, alignment=8)
        tensor_b = TensorDescription(element=float16, layout=layout_b, alignment=8)
        tensor_c = TensorDescription(element=element_c, layout=layout_c, alignment=alignment_c)

        epilogue = LinearCombination(
            tensor_c.element,
            tensor_c.alignment,
            math_inst.element_accumulator,
            element_epilogue,
        )

        operation = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=tensor_a,
            B=tensor_b,
            C=tensor_c,
            epilogue_functor=epilogue,
            swizzling_functor=swizzling_functor,
            **operation_kwargs,
        )

        self.assertTrue(test_all_gemm(operation, "universal"))

    # NOTE(review): the name mentions f32/bf16 operands, but A and B are float16 here.
    # This is the only case using BatchedIdentitySwizzle and direct_store=True.
    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.RowMajor,
            layout_c=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
            swizzling_functor=cutlass_bindings.BatchedIdentitySwizzle,
            direct_store=True,
        )

    def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 128, 64],
            warp_count=[2, 2, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=8,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 256, 64],
            warp_count=[2, 4, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            layout_c=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[256, 128, 64],
            warp_count=[4, 2, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float32,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 64, 64],
            warp_count=[2, 1, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.RowMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float16,
            element_epilogue=cutlass_bindings.float16,
        )

    # NOTE(review): the name says f32 output and f32 accumulation, but C, the
    # accumulator and the epilogue element are all float16 here.
    def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
        self._check_universal_gemm(
            threadblock_shape=[64, 64, 32],
            warp_count=[2, 2, 1],
            stages=10,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.RowMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float16,
            element_epilogue=cutlass_bindings.float16,
        )

    # NOTE(review): the name says f32 output, but C is float16 here.
    def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[256, 128, 64],
            warp_count=[4, 2, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.RowMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=8,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    # Previously named `test_test_SM80_...`; the doubled prefix was a typo.
    # NOTE(review): the name says f16 accumulation, but the accumulator is float32 here.
    def test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 64, 64],
            warp_count=[2, 1, 1],
            stages=3,
            layout_a=cutlass_bindings.RowMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            layout_c=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    # NOTE(review): the name says f32 output, but C is float16 here.
    def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 256, 64],
            warp_count=[2, 4, 1],
            stages=3,
            layout_a=cutlass_bindings.RowMajor,
            layout_b=cutlass_bindings.RowMajor,
            layout_c=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float16,
            alignment_c=8,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )

    # NOTE(review): the name says t/t/t layouts, but A, B and C are all ColumnMajor,
    # which makes this configuration identical to the f16n_f16n_f32n 128x256x64 test.
    def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
        self._check_universal_gemm(
            threadblock_shape=[128, 256, 64],
            warp_count=[2, 4, 1],
            stages=3,
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            layout_c=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32,
            alignment_c=4,
            element_accumulator=cutlass_bindings.float32,
            element_epilogue=cutlass_bindings.float32,
        )
if __name__ == '__main__':
    # Both arguments to get_memory_pool are 2**30 (presumably sizes in bytes - TODO confirm).
    pool_size = 1 << 30
    cutlass.backend.get_memory_pool(pool_size, pool_size)
    unittest.main()

View File

@ -1,182 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from functools import partial
import cutlass.backend
from cutlass.backend import *
from cutlass.backend import library
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
# Partial specialization of `get_name` used for naming the generated tests:
# both input operands are fixed to float16 and the architecture to 90.
name_fn = partial(
    get_name,
    element_a=cutlass_bindings.float16,
    element_b=cutlass_bindings.float16,
    arch=90,
)
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Generate a GEMM test method for the given configuration and attach it to `cls`.

    The method is stored on `cls` under the name produced by `name_fn` (with a
    ``_persistent`` suffix when `persistent` is true) and is also returned.
    Operands A and B are always float16.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool

    :return: the generated test method
    """
    test_name = name_fn(
        layouts, alignments, element_output, element_accumulator, element_epilogue,
        cluster_shape, threadblock_shape, stages,
        opclass=opclass, suffix="_persistent" if persistent else "",
    )

    def run(self):
        """
        Build the configured GEMM operation and assert that `test_all_gemm` passes for it.
        """
        fp16 = cutlass_bindings.float16
        # Only SIMT kernels get an explicit instruction shape and warp count;
        # every other opclass passes None for both.
        simt = opclass == cutlass_bindings.OpClass.Simt

        mma = MathInstruction(
            instruction_shape=[1, 1, 1] if simt else None,
            element_a=fp16,
            element_b=fp16,
            element_accumulator=element_accumulator,
            opcode_class=opclass,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages,
            warp_count=[2, 2, 1] if simt else None,
            math_instruction=mma,
            persistent=persistent,
        )

        operand_a = TensorDescription(element=fp16, layout=layouts[0], alignment=alignments[0])
        operand_b = TensorDescription(element=fp16, layout=layouts[1], alignment=alignments[1])
        operand_c = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        epilogue = LinearCombination(
            operand_c.element, operand_c.alignment, mma.element_accumulator, element_epilogue
        )

        gemm = GemmOperationUniversal(
            arch=90,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm, "universal"))

    setattr(cls, test_name, run)
    return run
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmF16Sm90(unittest.TestCase):
    """
    Empty test case that serves as the attachment point for generated tests.

    Its test methods are added dynamically by the `add_test` calls that follow
    the class definition.
    """
# Registration helpers: `add_test` with `opclass` pinned to TensorOp / Simt.
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
# Each call below registers one generated test on GemmF16Sm90. Positional
# arguments follow the `add_test` signature:
#   cls, layouts, alignments, element_output, element_accumulator,
#   element_epilogue, cluster_shape, threadblock_shape, stages
# Many cases pass None for `stages`; presumably the backend then selects the
# stage count itself -- TODO confirm.
# Tests with 1x1x1 clusters
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], 5)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
# Tests with different cluster shapes
# (all use float32 output, alignment 8 and a 64x128x64 threadblock tile)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 2, 1], [64, 128, 64], None)
# Tests for persistent warp-specialized threadblocks
# (persistent=True is forwarded to TileDescription and adds the "_persistent" name suffix)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 2, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 4, 1], [64, 128, 64], None, persistent=True)
# Tests using SIMT
# (alignment 1 on every operand, K tile of 8, 2 stages)
add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 8], 2)
if __name__ == "__main__":
    # Set up the backend memory pool before the tests run; 2**30 is passed for
    # both arguments (presumably initial and maximum size in bytes -- confirm).
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,178 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.memory_manager import get_allocated_size
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):
    """
    Float32 tensor-op GEMM tests built for arch 80.

    The tests differ only in math operation, threadblock tile and the layout
    of operand A; everything else is shared through `_run_f32_gemm`.
    """

    def _run_f32_gemm(self, math_operation, threadblock_shape, layout_a):
        """
        Build a float32 universal GEMM and assert that `test_all_gemm` passes for it.

        Fixed across all callers: instruction shape [16, 8, 8], 3 stages, warp
        count [2, 2, 1], alignment 4 on every operand, column-major B,
        row-major C and a float32 epilogue.

        :param math_operation: math operation given to the MathInstruction
        :param threadblock_shape: dimensions of the threadblock tile
        :param layout_a: layout of operand A
        """
        single = cutlass_bindings.float32

        mma = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=single,
            element_b=single,
            element_accumulator=single,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=math_operation,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )

        operand_a = TensorDescription(element=single, layout=layout_a, alignment=4)
        operand_b = TensorDescription(
            element=single, layout=cutlass_bindings.ColumnMajor, alignment=4
        )
        operand_c = TensorDescription(
            element=single, layout=cutlass_bindings.RowMajor, alignment=4
        )

        epilogue = LinearCombination(
            operand_c.element, operand_c.alignment, mma.element_accumulator, single
        )

        gemm = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm, "universal"))

    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        # Row-major A, multiply_add_fast_bf16, 128x128x32 threadblock tile.
        self._run_f32_gemm(
            MathOperation.multiply_add_fast_bf16,
            [128, 128, 32],
            cutlass_bindings.RowMajor,
        )

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
        # Column-major A, plain multiply_add, 128x128x32 threadblock tile.
        self._run_f32_gemm(
            MathOperation.multiply_add,
            [128, 128, 32],
            cutlass_bindings.ColumnMajor,
        )

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
        # Column-major A, multiply_add_fast_f32, 64x64x32 threadblock tile.
        self._run_f32_gemm(
            MathOperation.multiply_add_fast_f32,
            [64, 64, 32],
            cutlass_bindings.ColumnMajor,
        )
if __name__ == "__main__":
    # Set up the backend memory pool before the tests run; 2**24 is passed for
    # both arguments (presumably initial and maximum size in bytes -- confirm).
    cutlass.backend.get_memory_pool(1 << 24, 1 << 24)
    # Going by its name, this reuses previously cached compilation results
    # -- TODO confirm against the compiler module.
    cutlass.backend.compiler.load_from_cache()
    unittest.main()

View File

@ -1,134 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF64TensorOpSm80(unittest.TestCase):
    """
    Float64 tensor-op GEMM tests built for arch 80.

    The tests differ only in threadblock tile and in the layouts of operands
    A and B; everything else is shared through `_run_f64_gemm`.
    """

    def _run_f64_gemm(self, threadblock_shape, layout_a, layout_b):
        """
        Build a float64 universal GEMM and assert that `test_all_gemm` passes for it.

        Fixed across all callers: instruction shape [8, 8, 4], multiply_add,
        4 stages, warp count [2, 2, 1], row-major C and a float64 epilogue.

        :param threadblock_shape: dimensions of the threadblock tile
        :param layout_a: layout of operand A
        :param layout_b: layout of operand B
        """
        double = cutlass_bindings.float64

        mma = MathInstruction(
            instruction_shape=[8, 8, 4],
            element_a=double,
            element_b=double,
            element_accumulator=double,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=4,
            warp_count=[2, 2, 1],
            math_instruction=mma,
        )

        # Alignment is restricted to 1 for double-precision operands.
        operand_a = TensorDescription(element=double, layout=layout_a, alignment=1)
        operand_b = TensorDescription(element=double, layout=layout_b, alignment=1)
        operand_c = TensorDescription(
            element=double, layout=cutlass_bindings.RowMajor, alignment=1
        )

        epilogue = LinearCombination(
            operand_c.element, operand_c.alignment, mma.element_accumulator, double
        )

        gemm = GemmOperationUniversal(
            arch=80,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm, "universal"))

    def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
        # Column-major A, row-major B, 32x32x16 threadblock tile.
        self._run_f64_gemm(
            [32, 32, 16],
            cutlass_bindings.ColumnMajor,
            cutlass_bindings.RowMajor,
        )

    def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
        # Row-major A, column-major B, 64x64x16 threadblock tile.
        self._run_f64_gemm(
            [64, 64, 16],
            cutlass_bindings.RowMajor,
            cutlass_bindings.ColumnMajor,
        )
if __name__ == "__main__":
    # Set up the backend memory pool before the tests run; 2**30 is passed for
    # both arguments (presumably initial and maximum size in bytes -- confirm).
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,124 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from functools import partial
import cutlass.backend
from cutlass.backend import *
from cutlass.backend import library
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
# Partial specialization of `get_name` used for naming the generated tests:
# both input operands are fixed to float64 and the architecture to 90.
name_fn = partial(
    get_name,
    element_a=cutlass_bindings.float64,
    element_b=cutlass_bindings.float64,
    arch=90,
)
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass):
    """
    Generate a GEMM test method for the given configuration and attach it to `cls`.

    The method is stored on `cls` under the name produced by `name_fn` and is
    also returned. Operands A and B are always float64.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass

    :return: the generated test method
    """
    test_name = name_fn(
        layouts, alignments, element_output, element_accumulator, element_epilogue,
        cluster_shape, threadblock_shape, stages, opclass=opclass,
    )

    def run(self):
        """
        Build the configured GEMM operation and assert that `test_all_gemm` passes for it.
        """
        fp64 = cutlass_bindings.float64
        # Only SIMT kernels get an explicit instruction shape and warp count;
        # every other opclass passes None for both.
        simt = opclass == cutlass_bindings.OpClass.Simt

        mma = MathInstruction(
            instruction_shape=[1, 1, 1] if simt else None,
            element_a=fp64,
            element_b=fp64,
            element_accumulator=element_accumulator,
            opcode_class=opclass,
            math_operation=MathOperation.multiply_add,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages,
            warp_count=[2, 2, 1] if simt else None,
            math_instruction=mma,
        )

        operand_a = TensorDescription(element=fp64, layout=layouts[0], alignment=alignments[0])
        operand_b = TensorDescription(element=fp64, layout=layouts[1], alignment=alignments[1])
        operand_c = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        epilogue = LinearCombination(
            operand_c.element, operand_c.alignment, mma.element_accumulator, element_epilogue
        )

        gemm = GemmOperationUniversal(
            arch=90,
            tile_description=tile,
            A=operand_a,
            B=operand_b,
            C=operand_c,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(gemm, "universal"))

    setattr(cls, test_name, run)
    return run
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmF64Sm90(unittest.TestCase):
    """
    Empty test case that serves as the attachment point for generated tests.

    Its test methods are added dynamically by the `add_test` calls that follow
    the class definition.
    """
# Registration helper: `add_test` with `opclass` pinned to Simt.
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
# The single float64 SIMT case registered on GemmF64Sm90.
add_test_simt(
    GemmF64Sm90,               # cls
    LayoutCombination.NNN,     # layouts of A, B, C
    [1, 1, 1],                 # alignments of A, B, C
    cutlass_bindings.float64,  # element_output
    cutlass_bindings.float64,  # element_accumulator
    cutlass_bindings.float64,  # element_epilogue
    [1, 1, 1],                 # cluster_shape
    [64, 64, 32],              # threadblock_shape
    2,                         # stages
)
if __name__ == "__main__":
    # Set up the backend memory pool before the tests run; 2**30 is passed for
    # both arguments (presumably initial and maximum size in bytes -- confirm).
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,235 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_grouped_testbed import TestbedGrouped
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmGroupedSm80(unittest.TestCase):
def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
    # Grouped float16 tensor-op GEMM with float32 accumulation and output for
    # arch 80, run once per scheduler precompute mode (Device, then Host).
    # NOTE(review): the test name marks B as `t`, yet B is ColumnMajor below
    # like A and C -- confirm which of the two is intended.
    half = cutlass_bindings.float16
    single = cutlass_bindings.float32
    col_major = cutlass_bindings.ColumnMajor

    mma = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=half,
        element_b=half,
        element_accumulator=single,
        opcode_class=cutlass_bindings.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add,
    )
    tile = TileDescription(
        threadblock_shape=[128, 128, 32],
        stages=3,
        warp_count=[2, 2, 1],
        math_instruction=mma,
    )

    operand_a = TensorDescription(element=half, layout=col_major, alignment=8)
    operand_b = TensorDescription(element=half, layout=col_major, alignment=8)
    operand_c = TensorDescription(element=single, layout=col_major, alignment=4)

    # The epilogue takes its output element/alignment from C and computes in float32.
    epilogue = LinearCombination(
        operand_c.element, operand_c.alignment, mma.element_accumulator, single
    )
    swizzle = cutlass_bindings.BatchedIdentitySwizzle

    for mode in (SchedulerMode.Device, SchedulerMode.Host):
        grouped_gemm = GemmOperationGrouped(
            80,
            tile, operand_a, operand_b, operand_c,
            epilogue, swizzle,
            precompute_mode=mode,
        )
        self.assertTrue(TestbedGrouped(operation=grouped_gemm).run(24))
def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
math_inst = MathInstruction(
instruction_shape=[8, 8, 4], element_a=cutlass_bindings.float64,
element_b=cutlass_bindings.float64, element_accumulator=cutlass_bindings.float64,
opcode_class=cutlass_bindings.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
tile_description = TileDescription(
threadblock_shape=[64, 64, 16],
stages=4, warp_count=[2, 2, 1],
math_instruction=math_inst
)
A = TensorDescription(
element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
alignment=1
)
B = TensorDescription(
element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
alignment=1
)
C = TensorDescription(
element=cutlass_bindings.float64, layout=cutlass_bindings.ColumnMajor,
alignment=1
)
element_epilogue = cutlass_bindings.float64
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, element_epilogue)
swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
operation = GemmOperationGrouped(
80,
tile_description, A, B, C,
epilogue_functor, swizzling_functor,
precompute_mode=precompute_mode
)
testbed = TestbedGrouped(operation=operation)
self.assertTrue(testbed.run(24))
def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
math_inst = MathInstruction(
instruction_shape=[1, 1, 1], element_a=cutlass_bindings.float32,
element_b=cutlass_bindings.float32, element_accumulator=cutlass_bindings.float32,
opcode_class=cutlass_bindings.OpClass.Simt,
math_operation=MathOperation.multiply_add
)
tile_description = TileDescription(
threadblock_shape=[128, 64, 8],
stages=4, warp_count=[2, 2, 1],
math_instruction=math_inst
)
A = TensorDescription(
element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
alignment=1
)
B = TensorDescription(
element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
alignment=1
)
C = TensorDescription(
element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
alignment=1
)
element_epilogue = cutlass_bindings.float32
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, element_epilogue)
swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
operation = GemmOperationGrouped(
80,
tile_description, A, B, C,
epilogue_functor, swizzling_functor,
precompute_mode=precompute_mode
)
testbed = TestbedGrouped(operation=operation)
self.assertTrue(testbed.run(27))
def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
math_inst = MathInstruction(
instruction_shape=[16, 8, 16], element_a=cutlass_bindings.float16,
element_b=cutlass_bindings.float16, element_accumulator=cutlass_bindings.float32,
opcode_class=cutlass_bindings.OpClass.TensorOp,
math_operation=MathOperation.multiply_add
)
tile_description = TileDescription(
threadblock_shape=[128, 128, 32],
stages=3, warp_count=[2, 2, 1],
math_instruction=math_inst
)
A = TensorDescription(
element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
alignment=8
)
B = TensorDescription(
element=cutlass_bindings.float16, layout=cutlass_bindings.ColumnMajor,
alignment=8
)
C = TensorDescription(
element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
alignment=4
)
element_epilogue = cutlass_bindings.float32
epilogue_functor = LinearCombination(
C.element, C.alignment,
math_inst.element_accumulator, element_epilogue)
swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle
for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
operation = GemmOperationGrouped(
80,
tile_description, A, B, C,
epilogue_functor, swizzling_functor,
precompute_mode=precompute_mode
)
testbed = TestbedGrouped(operation=operation)
self.assertTrue(testbed.run(5))
if __name__ == "__main__":
    # Set up the backend memory pool (both size arguments are 2**30) before running the tests
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,261 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass.backend
from cutlass.backend import *
from cutlass.backend.epilogue import LinearCombinationClamp
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmS8TensorOpF32Sm80(unittest.TestCase):
    """
    Tensor-op GEMM tests for SM80 with int8 A/B operands and int32 accumulation.
    """

    def _run_s8_gemm(self, *, threadblock_shape, stages, layouts, alignments, element_c,
                     fast_clamp, testbed_mode, math_operation=MathOperation.multiply_add):
        """
        Build an int8 universal GEMM from the given description and verify it with ``test_all_gemm``.

        :param threadblock_shape: indexable container of dimensions of threadblock tiles
        :param stages: number of pipeline stages to use in the kernel
        :param layouts: layouts of the A, B, and C operands, in that order
        :param alignments: alignments of the A, B, and C operands, in that order
        :param element_c: data type of operand C
        :param fast_clamp: use ``FastLinearCombinationClamp`` when true, otherwise
            ``LinearCombinationClamp`` with an int32 epilogue type
        :type fast_clamp: bool
        :param testbed_mode: second argument passed to ``test_all_gemm``
        :type testbed_mode: str
        :param math_operation: math operation of the instruction
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 32],
            element_a=cutlass_bindings.int8,
            element_b=cutlass_bindings.int8,
            element_accumulator=cutlass_bindings.int32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=math_operation,
        )
        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst,
        )

        # Operands are described in A, B, C order
        A, B, C = [
            TensorDescription(element=element, layout=layout, alignment=alignment)
            for element, layout, alignment in zip(
                (cutlass_bindings.int8, cutlass_bindings.int8, element_c), layouts, alignments
            )
        ]

        if fast_clamp:
            epilogue_functor = FastLinearCombinationClamp(C.element, C.alignment)
        else:
            element_epilogue = cutlass_bindings.int32
            epilogue_functor = LinearCombinationClamp(
                C.element, C.alignment, math_inst.element_accumulator, element_epilogue
            )

        swizzling_functor = cutlass_bindings.IdentitySwizzle1
        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
        )
        self.assertTrue(test_all_gemm(operation, testbed_mode))

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add_saturate,
            threadblock_shape=[64, 64, 64],
            stages=6,
            layouts=(
                cutlass_bindings.ColumnMajorInterleaved32,
                cutlass_bindings.RowMajorInterleaved32,
                cutlass_bindings.ColumnMajorInterleaved32,
            ),
            alignments=(16, 16, 8),
            element_c=cutlass_bindings.int8,
            fast_clamp=True,
            testbed_mode="interleaved",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
        self._run_s8_gemm(
            threadblock_shape=[128, 128, 128],
            stages=3,
            layouts=(cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor, cutlass_bindings.RowMajor),
            alignments=(16, 16, 16),
            element_c=cutlass_bindings.int8,
            fast_clamp=True,
            testbed_mode="multistage",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            threadblock_shape=[128, 128, 128],
            stages=3,
            layouts=(cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor, cutlass_bindings.ColumnMajor),
            alignments=(16, 16, 16),
            element_c=cutlass_bindings.int8,
            fast_clamp=True,
            testbed_mode="multistage",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            threadblock_shape=[128, 128, 128],
            stages=3,
            layouts=(cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor, cutlass_bindings.ColumnMajor),
            alignments=(16, 16, 4),
            element_c=cutlass_bindings.int32,
            fast_clamp=False,
            testbed_mode="multistage",
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            threadblock_shape=[128, 128, 128],
            stages=3,
            layouts=(cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor, cutlass_bindings.RowMajor),
            alignments=(16, 16, 4),
            element_c=cutlass_bindings.int32,
            fast_clamp=False,
            testbed_mode="multistage",
        )
if __name__ == "__main__":
    # Set up the backend memory pool (both size arguments are 2**30) before running the tests
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,154 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from functools import partial
import cutlass.backend
from cutlass.backend import *
from cutlass.backend import library
from cutlass.backend.test import *
import unittest
from cutlass.backend.test.utils import LayoutCombination, get_name
from cutlass.backend.test.gemm_testbed import test_all_gemm
from cutlass.backend.utils.device import device_cc
# Test-name generator for this file. Operands A and B are int8 in every generated test
# (see `run` inside `add_test`), so the names are tagged with int8 rather than float16.
name_fn = partial(get_name, element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8, arch=90)
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Generate a test method for the given GEMM specification and attach it to `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool

    :return: the generated test method (also set on `cls` under its generated name)
    """

    def run(self):
        """
        Generated test body: build the GEMM operation described by the enclosing
        `add_test` call and check it with `test_all_gemm`.
        """
        operand_type = cutlass_bindings.int8

        # SIMT kernels get an explicit instruction shape, warp count, and a clamping
        # epilogue; for other opclasses the shape and warp count are passed as None.
        if opclass == cutlass_bindings.OpClass.Simt:
            inst_shape, warp_count = [1, 1, 1], [2, 2, 1]
            epilogue_functor_cls = LinearCombinationClamp
        else:
            inst_shape, warp_count = None, None
            epilogue_functor_cls = LinearCombination

        math_inst = MathInstruction(
            instruction_shape=inst_shape,
            element_a=operand_type,
            element_b=operand_type,
            element_accumulator=element_accumulator,
            opcode_class=opclass,
            math_operation=MathOperation.multiply_add,
        )
        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages,
            warp_count=warp_count,
            math_instruction=math_inst,
            persistent=persistent,
        )

        operand_elements = (operand_type, operand_type, element_output)
        A, B, C = [
            TensorDescription(element=element, layout=layouts[idx], alignment=alignments[idx])
            for idx, element in enumerate(operand_elements)
        ]

        epilogue_functor = epilogue_functor_cls(
            C.element, C.alignment, math_inst.element_accumulator, element_epilogue
        )
        operation = GemmOperationUniversal(
            arch=90, tile_description=tile_description, A=A, B=B, C=C,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_gemm(operation, "universal"))

    suffix = "_persistent" if persistent else ""
    name = name_fn(
        layouts, alignments, element_output, element_accumulator,
        element_epilogue, cluster_shape, threadblock_shape, stages,
        opclass=opclass, suffix=suffix,
    )
    setattr(cls, name, run)
    return run
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmS8Sm90(unittest.TestCase):
    """
    Empty test case; its test methods are attached dynamically through `add_test`.
    """
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)

# Every case registered below uses int8 output, int32 accumulation, and an int32 epilogue.
_S8_ELEMENT_TYPES = (cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32)

# Columns: layouts, alignments, cluster shape, threadblock shape, stages, extra keyword arguments
_TENSOROP_CASES = [
    # Tests with 1x1x1 clusters
    (LayoutCombination.TNN, [16, 16, 16], [1, 1, 1], [128, 128, 128], 3, {}),
    (LayoutCombination.TNT, [16, 16, 16], [1, 1, 1], [128, 128, 128], None, {}),
    (LayoutCombination.TNT, [16, 16, 8], [1, 1, 1], [128, 128, 128], None, {}),
    (LayoutCombination.TNT, [16, 16, 16], [1, 1, 1], [64, 128, 128], None, {}),
    (LayoutCombination.TNT, [16, 16, 16], [1, 1, 1], [128, 64, 32], None, {}),
    (LayoutCombination.TNT, [4, 4, 16], [1, 1, 1], [128, 128, 128], None, {}),
    # Tests with different cluster shapes
    (LayoutCombination.TNT, [16, 16, 16], [2, 2, 1], [128, 128, 128], None, {}),
    (LayoutCombination.TNT, [16, 16, 16], [1, 4, 1], [128, 128, 128], None, {}),
    (LayoutCombination.TNT, [16, 16, 16], [4, 4, 1], [128, 128, 128], None, {}),
    # Tests with persistent warp-specialized threadblocks
    (LayoutCombination.TNT, [16, 16, 16], [2, 1, 1], [128, 128, 128], None, {"persistent": True}),
]
for _layouts, _alignments, _cluster, _tile, _stages, _extra in _TENSOROP_CASES:
    add_test_tensorop(
        GemmS8Sm90, _layouts, _alignments, *_S8_ELEMENT_TYPES, _cluster, _tile, _stages, **_extra
    )

# Tests for SIMT
add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], *_S8_ELEMENT_TYPES, [1, 1, 1], [64, 32, 8], 2)
if __name__ == "__main__":
    # Set up the backend memory pool (both size arguments are 2**30) before running the tests
    cutlass.backend.get_memory_pool(1 << 30, 1 << 30)
    unittest.main()

View File

@ -1,508 +0,0 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Util Functions for Conv2d Test
"""
import torch
import cutlass
import unittest
import cutlass_bindings
from cutlass.utils.datatypes import binding_type, binding_opclass
from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
from cutlass.backend.utils.device import device_cc
from cutlass.backend.test.utils import get_name_conv2d
import numpy as np
def conv2d_few_channel_problemsizes(channels):
    """
    Build the list of Conv2d problem sizes used by the few-channel tests.

    All problems use batch size 1, square inputs and filters, padding (1, 1, 1, 1),
    dilation (1, 1), cross-correlation mode, and 1 for both trailing integer arguments.

    :param channels: channel count used for both the input and the filter tensors
    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize``
    """
    # Columns: input height/width, filter count, filter height/width, stride
    variants = (
        (8, 16, 3, 2),
        (16, 16, 3, 2),
        (16, 16, 7, 1),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    )
    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, extent, extent, channels),
            cutlass_bindings.Tensor4DCoord(filters, window, window, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride, stride),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for extent, filters, window, stride in variants
    ]
# CUTLASS data type -> torch dtype, used when tensors are created with the torch backend
torch_dtype = dict([
    (cutlass.DataType.f16, torch.float16),
    (cutlass.DataType.f32, torch.float32),
    (cutlass.DataType.f64, torch.float64),
])

# CUTLASS data type -> numpy dtype, used when tensors are created with the numpy backend
numpy_dtype = dict([
    (cutlass.DataType.f16, np.float16),
    (cutlass.DataType.f32, np.float32),
    (cutlass.DataType.f64, np.float64),
])
def validate_problem_size(ps, conv_kind, split_k_slices):
    """
    Decide whether a problem size can be run for the given convolution kind.

    :param ps: problem size exposing H, W, R, S, P, Q and the pad/stride/dilation fields
    :param conv_kind: convolution kind; compared against the string ``"dgrad"``
    :param split_k_slices: number of split-K slices requested
    :return: False when the stored output extent (P, Q) disagrees with the extent
        recomputed from the other fields, or when split-K is combined with a strided
        dgrad; True otherwise
    :rtype: bool
    """
    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad
    strided = ps.stride_h > 1 or ps.stride_w > 1
    return not (conv_kind == "dgrad" and split_k_slices > 1 and strided)
# Override the backend launcher
class Conv2dLauncherFrontend(Conv2dLauncher):
    """
    Launcher that runs a ``cutlass.Conv2d`` plan on randomly initialized tensors and
    compares the result against a host (numpy backend) or torch reference.
    """

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        """
        :param plan: Conv2d plan to run; stored as ``self.operation``
        :param seed: seed handed to the random number generator of the selected backend
        :param backend: ``"numpy"`` selects the numpy/host code paths; the other code
            paths use torch
        """
        # The base-class constructor is not invoked; all state used here is set below.
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator

        # All four tensors use the NHWC layout
        nhwc = cutlass_bindings.TensorNHWC
        self.layout_A = nhwc
        self.layout_B = nhwc
        self.layout_C = nhwc
        self.layout_D = nhwc

        self.element_compute = cutlass_bindings.float32
        self.enable_cached_results = True

        # Get randomization_max
        half_types = [cutlass.DataType.f16, cutlass.DataType.bf16]
        if self.dtype_A not in half_types:
            self.randomization_max = 7
        elif self.dtype_acc in half_types:
            self.randomization_max = 2
        else:
            self.randomization_max = 3

        self.activation = plan.activation
        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d

    def set_seed(self):
        """Seed the random number generator of the selected backend with ``self.seed``."""
        seed_fn = np.random.seed if self.backend == "numpy" else torch.manual_seed
        seed_fn(self.seed)

    def uniform_init(self, size, dtype):
        """
        Create a randomly initialized tensor of the given size and CUTLASS data type.

        The numpy backend defers to the base class. Otherwise a CUDA torch tensor is
        drawn uniformly, rounded up with ``ceil``, and converted to channels-last
        memory format.
        """
        if self.backend == "numpy":
            return super().uniform_init(size, numpy_dtype[dtype])

        bound = self.randomization_max
        samples = torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda")
        samples.uniform_(-bound - 0.5, bound - 0.5)
        return torch.ceil(samples).to(memory_format=torch.channels_last)

    def zeros_like(self, tensor):
        """Return a zero tensor like `tensor` (channels-last when not using numpy)."""
        if self.backend != "numpy":
            return torch.zeros_like(tensor).to(memory_format=torch.channels_last)
        return np.zeros_like(tensor)

    def reference(self, ps, A, B, C, alpha, beta, activation):
        """
        Compute the reference result ``activation(alpha * conv(A, B) + beta * C)``.

        The numpy backend delegates to `host_reference`; otherwise the convolution is
        computed with torch according to ``self.conv_kind``.

        :raises Exception: if ``self.conv_kind`` is not fprop, dgrad, or wgrad (torch path)
        """
        if self.backend == "numpy":
            return self.host_reference(ps, A, B, C, alpha, beta, activation)

        operators = cutlass_bindings.conv.Operator
        if self.conv_kind == operators.fprop:
            conv_out = torch.ops.aten.conv2d(
                A,
                B,
                stride=(ps.stride_h, ps.stride_w),
                padding=(ps.pad_h, ps.pad_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            )
        elif self.conv_kind == operators.dgrad:
            # NOTE(review): dilation is not forwarded on the dgrad/wgrad paths - confirm intended
            conv_out = torch.nn.grad.conv2d_input(
                (ps.N, ps.C, ps.H, ps.W),
                B,
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            )
        elif self.conv_kind == operators.wgrad:
            conv_out = torch.nn.grad.conv2d_weight(
                B,
                (ps.K, ps.C, ps.R, ps.S),
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w)
            )
        else:
            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

        torch_result = alpha * conv_out + beta * C

        if activation == cutlass.backend.epilogue.relu:
            return torch.nn.functional.relu(torch_result)
        if activation == cutlass.backend.epilogue.leaky_relu:
            return torch.nn.functional.leaky_relu(torch_result, 0.5)
        return torch_result

    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
        """
        Compute the reference on the host, consulting the cached-results file when
        ``self.enable_cached_results`` is set.

        :return: the ``D`` field of the cached test result when caching is enabled
            (either found in the cache file or freshly hashed and written to it);
            the computed reference tensor when caching is disabled
        """
        # Convert the scalars for fp16 / int32 compute types; otherwise use them as given
        if self.element_compute == cutlass_bindings.float16:
            alpha, beta = cutlass_bindings.float16(alpha), cutlass_bindings.float16(beta)
        elif self.element_compute == cutlass_bindings.int32:
            alpha, beta = int(alpha), int(beta)

        def as_view(tensor, layout, operand):
            return getTensorView(tensor, layout, self.conv_kind, problem_size, operand)

        def as_ref(tensor, layout, operand):
            return getTensorRef(tensor, layout, self.conv_kind, problem_size, operand)

        if self.enable_cached_results:
            host = cutlass_bindings.test.conv.host

            # Get problem key
            cached_test_key = host.CreateCachedConv2dTestKey(
                self.conv_kind,
                problem_size,
                alpha,
                beta,
                as_view(tensor_A, self.layout_A, "a"),
                as_view(tensor_B, self.layout_B, "b"),
                as_view(tensor_C, self.layout_C, "c"),
            )
            # The activation tag is appended so different activations get different keys
            cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"

            cached_test_result = host.CachedTestResult()
            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
                self.operation.arch,
                self.seed,
            )

            # find() yields (found, result); a hit short-circuits the host computation
            lookup = host.CachedTestResultListing(conv2d_result_cache_name).find(cached_test_key)
            if lookup[0]:
                return lookup[1].D

        # Compute the conv2d on host
        tensor_D_ref = np.ones_like(tensor_C)
        ref_A = as_ref(tensor_A, self.layout_A, "a")
        ref_B = as_ref(tensor_B, self.layout_B, "b")
        ref_C = as_ref(tensor_C, self.layout_C, "c")
        ref_D = as_ref(tensor_D_ref, self.layout_D, "d")
        self.host_conv2d(
            self.conv_kind, problem_size, ref_A, ref_B, ref_C, ref_D, alpha, beta
        )

        # leaky_relu is the only activation that is given an extra argument (0.5)
        extra_args = (0.5,) if activation == cutlass.backend.epilogue.leaky_relu else ()
        tensor_D_ref = activation.numpy(tensor_D_ref, *extra_args)

        tensor_view_D_ref = as_view(tensor_D_ref, self.layout_D, "d")

        if not self.enable_cached_results:
            return tensor_D_ref

        # Record the hash of the fresh result in the cache file
        cached_test_result.D = host.TensorHash(tensor_view_D_ref)
        cached_results = host.CachedTestResultListing(conv2d_result_cache_name)
        cached_results.append(cached_test_key, cached_test_result)
        cached_results.write(conv2d_result_cache_name)
        return cached_test_result.D

    def equal(self, tensor_D, tensor_D_ref, problem_size):
        """
        Compare the computed output with the reference.

        The numpy backend defers to the base class; otherwise CUDA is synchronized
        and the tensors are compared with ``torch.equal``.
        """
        if self.backend == "numpy":
            return super().equal(tensor_D, tensor_D_ref, problem_size)

        torch.cuda.synchronize()
        return torch.equal(tensor_D, tensor_D_ref)

    def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        """
        Run the plan on problem size `ps` with freshly seeded random inputs and
        return the result of comparing its output with the reference.

        :raises Exception: if ``self.conv_kind`` is not fprop, dgrad, or wgrad
        """
        #
        # Initialize input and output tensors
        #

        # Which tensor plays the role of operand A, B, and C for each conv kind
        operators = cutlass_bindings.conv.Operator
        if self.conv_kind == operators.fprop:
            roles = ("activation", "filter", "output")
        elif self.conv_kind == operators.dgrad:
            roles = ("output", "filter", "activation")
        elif self.conv_kind == operators.wgrad:
            roles = ("output", "activation", "filter")
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        channels_last_shapes = {
            "activation": (ps.N, ps.H, ps.W, ps.C),
            "filter": (ps.K, ps.R, ps.S, ps.C),
            "output": (ps.N, ps.P, ps.Q, ps.K),
        }
        sizes = [channels_last_shapes[role] for role in roles]
        if self.backend == "torch":
            # torch sizes put the channel dimension second
            sizes = [(d0, d3, d1, d2) for d0, d1, d2, d3 in sizes]
        tensor_A_size, tensor_B_size, tensor_C_size = sizes

        self.set_seed()

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = self.zeros_like(tensor_C)

        self.operation.run(
            tensor_A, tensor_B, tensor_C, tensor_D,
            stride=(ps.stride_h, ps.stride_w),
            padding=(ps.pad_h, ps.pad_w),
            dilation=(ps.dilation_h, ps.dilation_w),
            alpha=alpha, beta=beta,
            split_k=(split_k_mode, split_k_slices),
        )

        tensor_D_ref = self.reference(
            ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
        )
        return self.equal(tensor_D, tensor_D_ref, ps)
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """
    Build a test method for the given kernel specification and attach it to ``cls``.

    The method is stored on ``cls`` under the name produced by ``get_name_conv2d``.

    :param cls: class that receives the generated test method
    :param cc: compute capability; only used when building the test name
    :param conv_kind: convolution kind passed to ``cutlass.Conv2d``
    :param problem_sizes: iterable of problem sizes to run; entries rejected by
        ``validate_problem_size`` are skipped
    :param element: data type passed as ``element`` to ``cutlass.Conv2d``
    :param element_accumulator: accumulator data type
    :param element_output: data type used for both C and D
    :param opclass: opcode class assigned to the plan
    :param threadblock_shape: threadblock tile shape
    :param warp_count: number of warps along each tile dimension
    :param instruction_shape: instruction shape
    :param stages: number of pipeline stages
    :param iterator_algorithm: iterator algorithm; left at the plan's default when None
    :param swizzle: swizzling stride; left at the plan's default when None
    :param split_k_mode: split-K mode forwarded to the launcher
    :param split_k_slices: number of split-K slices forwarded to the launcher
    :param activation: ``"identity"``, ``"leaky_relu"`` or the name of an attribute
        of ``cutlass.epilogue``

    :return: the generated test function
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Describe the kernel: plan first, then opclass, then the tile description.
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )
        plan.opclass = opclass
        plan.tile_description = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        # Optional knobs are only touched when the caller supplied them.
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation == "leaky_relu":
            # leaky_relu is assigned together with its 0.5 argument
            plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
        elif activation != "identity":
            plan.activation = getattr(cutlass.epilogue, activation)

        launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")
        runnable = (
            ps for ps in problem_sizes
            if validate_problem_size(ps, conv_kind, split_k_slices)
        )
        for ps in runnable:
            # alpha = 1.0, beta = 0.5
            self.assertTrue(launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5))

    setattr(cls, test_name, run)
    return run
def get_conv_problems():
    """
    Return the default Conv2d problem sizes plus three extra small-alignment problems.

    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize`` objects
    """
    conv = cutlass_bindings.conv
    coord4 = cutlass_bindings.Tensor4DCoord
    coord2 = cutlass_bindings.MatrixCoord

    # 64: minimum channel size
    problems = list(cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes)

    # Alignment 4 & 2 tests. Each row holds the arguments of the first three
    # Tensor4DCoord values of one problem; the remaining arguments are shared.
    alignment_cases = [
        ((1, 4, 4, 12), (8, 3, 3, 12), (0, 0, 0, 0)),
        ((1, 4, 4, 14), (8, 3, 3, 14), (0, 0, 0, 0)),
        ((1, 23, 56, 98), (128, 3, 3, 98), (4, 0, 5, 0)),
    ]
    extra = [
        conv.Conv2dProblemSize(
            coord4(*first),
            coord4(*second),
            coord4(*third),
            coord2(3, 3),
            coord2(1, 1),
            conv.Mode.cross_correlation,
            1, 1
        )
        for first, second, third in alignment_cases
    ]
    problems.extend(extra)
    return problems

View File

@ -0,0 +1,660 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utilities for defining Conv2D problem sizes for testing.
This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
"""
import cutlass
from cutlass import ConvMode
from cutlass.shape import Conv2DProblemSize
class TestbedConv2dProblemSizes:
    """
    Collection of Conv2d problem sizes used for testing.

    After construction, ``self.all`` holds every generated problem size whose
    per-group channel count (``C // groups``) is a multiple of ``minimum_channel_size``.
    """

    def __init__(self, minimum_channel_size: int):
        """
        :param minimum_channel_size: channel count used by the parameterized problem sizes
            and divisor used when filtering problems into ``self.all``
        :type minimum_channel_size: int
        """
        size_lists = [
            self.initialize_conv2d_default_sizes(minimum_channel_size),
            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
            # ResNet50 layers are generated twice: batch size 1 and batch size 34
            self.initialize_conv2d_resnet50_sizes(1),
            self.initialize_conv2d_resnet50_sizes(34),
            self.initialize_conv2d_grouped_sizes(),
        ]

        # Filter all problems
        self.all = [
            size
            for size_list in size_lists
            for size in size_list
            if (size.C // size.groups) % minimum_channel_size == 0
        ]

    def initialize_conv2d_default_sizes(self, minimum_channel_size):
        """
        Build the default list of problem sizes.

        Every row below is expanded, in order, into the 14 positional arguments of one
        ``Conv2DProblemSize`` call. The five groups appear to be input (N, H, W, C),
        filter (K, R, S, C), padding, stride and dilation, judging by the section
        comments -- TODO confirm against ``Conv2DProblemSize``.

        :param minimum_channel_size: channel count used by the parameterized rows
        :return: list of ``Conv2DProblemSize``
        """
        c = minimum_channel_size
        rows = [
            # Small input size x stride (1,1)
            # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
            ((1, 1, 1, c), (8, 1, 1, c), (1, 1), (1, 1), (1, 1)),
            ((1, 1, 8, c), (8, 1, 3, c), (1, 1), (1, 1), (1, 1)),
            ((1, 7, 8, c), (8, 3, 3, c), (1, 1), (1, 1), (1, 1)),
            ((1, 7, 9, c), (8, 4, 4, c), (1, 1), (1, 1), (1, 1)),
            ((2, 7, 9, c), (8, 5, 5, c), (1, 1), (1, 1), (1, 1)),
            ((3, 7, 9, c), (8, 6, 5, c), (1, 1), (1, 1), (1, 1)),
            ((3, 7, 9, c), (8, 6, 6, c), (1, 1), (1, 1), (1, 1)),
            ((3, 7, 9, c), (8, 7, 7, c), (1, 1), (1, 1), (1, 1)),

            # Small input size x stride (2,2)
            # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
            ((1, 11, 7, c), (8, 1, 1, c), (0, 0), (2, 2), (1, 1)),
            ((1, 11, 7, c), (8, 3, 3, c), (1, 1), (2, 2), (1, 1)),
            ((1, 13, 11, c), (8, 1, 1, c), (1, 1), (2, 2), (1, 1)),
            ((1, 17, 19, c), (16, 2, 2, c), (1, 1), (2, 2), (1, 1)),
            ((1, 23, 5, c), (16, 3, 3, c), (1, 1), (2, 2), (1, 1)),
            ((1, 13, 17, 8), (24, 3, 3, 8), (0, 0), (2, 2), (1, 1)),
            ((1, 23, 21, 8), (24, 3, 3, 8), (1, 1), (3, 3), (1, 1)),
            ((1, 20, 24, 8), (40, 3, 3, 8), (3, 3), (3, 3), (1, 1)),

            # Medium input size, filter size (1x1, 2x3, 3x3, 5x5)
            ((1, 15, 19, 160), (224, 1, 1, 160), (0, 0), (1, 1), (1, 1)),
            ((1, 19, 37, 160), (224, 3, 3, 160), (1, 1), (2, 2), (1, 1)),
            ((1, 16, 16, 160), (224, 2, 3, 160), (1, 1), (1, 1), (1, 1)),
            ((1, 23, 21, 128), (224, 3, 3, 128), (1, 1), (1, 1), (1, 1)),
            ((1, 29, 37, 160), (224, 5, 5, 160), (2, 2), (1, 1), (1, 1)),

            # C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
            ((1, 15, 19, 32 + c), (96, 3, 3, 32 + c), (1, 1), (1, 1), (1, 1)),
            ((1, 16, 24, 64 + c), (96, 3, 3, 64 + c), (1, 1), (1, 1), (1, 1)),

            # Medium input size, filter size (1x1, 5x5, 7x7), stride (2, 2)
            ((1, 13, 16, 288), (160, 5, 5, 288), (2, 2), (2, 2), (1, 1)),
            ((1, 55, 51, 256), (512, 1, 1, 256), (0, 0), (2, 2), (1, 1)),
            ((1, 71, 80, 32), (64, 5, 5, 32), (2, 2), (2, 2), (1, 1)),
            ((1, 224, 224, 8), (64, 7, 7, 8), (3, 3), (2, 2), (1, 1)),

            # Medium input size stride (3, 3), filter (3, 3), non-default padding
            ((1, 27, 23, 256), (512, 3, 3, 256), (0, 0), (3, 3), (1, 1)),

            # Medium input size padding > stride, asymmetric filter, padding and striding
            ((1, 27, 31, 256), (512, 3, 3, 256), (5, 7), (3, 4), (1, 1)),
            ((1, 27, 35, 256), (512, 7, 5, 256), (11, 7), (3, 5), (1, 1)),

            # Medium input size *mixed* stride (1, 2) and (2, 1),
            # filter (3, 3), default padding
            ((1, 27, 27, 256), (512, 3, 3, 256), (1, 1), (1, 2), (1, 1)),
            ((1, 27, 27, 256), (512, 3, 3, 256), (1, 1), (2, 1), (1, 1)),

            # Additional input size
            ((3, 28, 28, 256), (256, 2, 2, 256), (0, 0), (2, 2), (1, 1)),
            ((1, 32, 32, 16), (32, 3, 3, 16), (1, 1), (6, 2), (1, 1)),
            ((32, 24, 32, 32), (32, 1, 2, 32), (0, 0), (1, 1), (1, 1)),
            ((4, 2, 3, 256), (328, 3, 5, 256), (1, 1), (1, 1), (1, 1)),
        ]
        return [
            Conv2DProblemSize(*g0, *g1, *g2, *g3, *g4)
            for g0, g1, g2, g3, g4 in rows
        ]

    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size, enabled=False):
        """
        Build a few large and rigorous convolution problem sizes.

        These cases are disabled by default; ``enabled`` replaces a hard-coded
        ``if False:`` guard that made them unreachable.

        :param minimum_channel_size: channel count used by the problem sizes
        :param enabled: when True, return the rigorous problem sizes; when False
            (the default) return an empty list
        :type enabled: bool
        :return: list of ``Conv2DProblemSize`` (empty unless ``enabled``)
        """
        if not enabled:
            return []

        c = minimum_channel_size
        return [
            Conv2DProblemSize.from_sizes(
                (1, 124, 224, 2 * c),
                (24, 7, 7, 2 * c),
            ),
            Conv2DProblemSize.from_sizes(
                (1, 233, 35, c),
                (24, 7, 5, c),
            ),
        ]

    def initialize_conv2d_resnet50_sizes(self, batch_size):
        """
        Build the ResNet50 layer problem sizes for the given batch size.

        Each row holds the first twelve positional arguments of one
        ``Conv2DProblemSize`` call in four groups; the final pair is always (1, 1).

        :param batch_size: value used as the first argument of every problem size
        :return: list of ``Conv2DProblemSize``
        """
        n = batch_size
        rows = [
            ((n, 56, 56, 64), (256, 1, 1, 64), (0, 0), (1, 1)),
            ((n, 56, 56, 64), (64, 1, 1, 64), (0, 0), (1, 1)),
            ((n, 56, 56, 64), (64, 3, 3, 64), (1, 1), (1, 1)),
            ((n, 56, 56, 256), (64, 1, 1, 256), (0, 0), (1, 1)),
            ((n, 56, 56, 256), (512, 1, 1, 256), (0, 0), (2, 2)),
            ((n, 56, 56, 256), (128, 1, 1, 256), (0, 0), (2, 2)),
            ((n, 28, 28, 128), (128, 3, 3, 128), (1, 1), (1, 1)),
            ((n, 28, 28, 128), (512, 1, 1, 128), (0, 0), (1, 1)),
            ((n, 28, 28, 512), (128, 1, 1, 512), (0, 0), (1, 1)),
            ((n, 28, 28, 512), (1024, 1, 1, 512), (0, 0), (2, 2)),
            ((n, 28, 28, 512), (256, 1, 1, 512), (0, 0), (2, 2)),
            ((n, 14, 14, 256), (256, 3, 3, 256), (1, 1), (1, 1)),
            ((n, 14, 14, 256), (1024, 1, 1, 256), (0, 0), (1, 1)),
            ((n, 14, 14, 1024), (256, 1, 1, 1024), (0, 0), (1, 1)),
            ((n, 14, 14, 1024), (2048, 1, 1, 1024), (0, 0), (2, 2)),
            ((n, 14, 14, 1024), (512, 1, 1, 1024), (0, 0), (2, 2)),
            ((n, 7, 7, 512), (512, 3, 3, 512), (1, 1), (1, 1)),
            ((n, 7, 7, 512), (2048, 1, 1, 512), (0, 0), (1, 1)),
            ((n, 7, 7, 2048), (512, 1, 1, 2048), (0, 0), (1, 1)),
        ]
        return [
            Conv2DProblemSize(*g0, *g1, *g2, *g3, 1, 1)
            for g0, g1, g2, g3 in rows
        ]

    def initialize_conv2d_grouped_sizes(self):
        """
        Build problem sizes whose last constructor argument (``groups``) is greater than 1.

        Each row is ``(first four args, next four args, fifth pair, groups)``. The
        pairs before and after the fifth pair are always (1, 1), and the mode is
        always ``ConvMode.CrossCorrelation`` followed by a literal 1 (presumably the
        split-K slice count -- TODO confirm against ``Conv2DProblemSize``).

        :return: list of ``Conv2DProblemSize``
        """
        threadblock_n = 128
        threadblock_k = 32

        # One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
        # One CTA calculates a single group
        rows = [
            (
                (1, 8, 8, threadblock_k * 2 * groups),
                (cta_per_group_k * threadblock_n * groups, 3, 3, threadblock_k * 2),
                (1, 1),
                groups,
            )
            for cta_per_group_k in range(1, 4)
            for groups in range(2, 5)
        ]

        rows += [
            # Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
            ((1, 8, 8, threadblock_k), (threadblock_n * 2, 3, 3, threadblock_k // 2), (1, 1), 2),
            ((1, 56, 56, 696), (768, 3, 3, 232), (2, 2), 3),
            ((1, 14, 14, 1392), (1536, 3, 3, 232), (1, 1), 3),

            # One CTA calculate multiple groups: CTA::N % k_per_group = 0
            # 2 groups per CTA
            ((1, 8, 8, threadblock_k * 4), (threadblock_n, 3, 3, threadblock_k * 2), (1, 1), 2),
            # 2 groups per CTA and partial gemm_k
            ((1, 8, 8, threadblock_k), (threadblock_n, 3, 3, threadblock_k // 2), (1, 1), 2),
            # 4 groups per CTA
            ((1, 8, 8, threadblock_k * 8), (threadblock_n // 2, 3, 3, threadblock_k * 2), (1, 1), 4),
            # 4 groups per CTA and partial gemm_k
            ((1, 8, 8, threadblock_k * 2), (threadblock_n // 2, 3, 3, threadblock_k // 2), (1, 1), 4),
        ]

        return [
            Conv2DProblemSize(
                *g0,
                *g1,
                1, 1,
                *g2,
                1, 1,
                ConvMode.CrossCorrelation,
                1,
                group_count
            )
            for g0, g1, g2, group_count in rows
        ]

View File

@ -31,56 +31,64 @@
#################################################################################################
"""
Low-level functionality tests for Conv2d operands on SM80
Low-level functionality tests for Conv2d operations on SM80
"""
from conv2d_test_utils import *
import cutlass
import logging
import unittest
import cutlass
from cutlass.backend.utils.device import device_cc
from conv2d_test_utils import *
cutlass.set_log_level(logging.WARNING)
cc = 80
@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
class Conv2dSm80(unittest.TestCase):
"""
Wrapper class to which tests will be added dynamically in __main__
"""
pass
conv_problems = get_conv_problems()
# Tests for optimized & analytic
for conv_kind in ["fprop", "wgrad", "dgrad"]:
# F16, simt
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="simt", threadblock_shape=[128, 128, 8],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="simt", threadblock_shape=[128, 128, 8],
warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
# F16, tensor op
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
# F16, tensor op, analytic iterator
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
# F16, tensor op, f32 output
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
# F16, tensor op, different tile description
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
# F32, simt
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="simt", threadblock_shape=[128, 128, 8],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
opclass="simt", threadblock_shape=[128, 128, 8],
warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
# Tf32, tensorop
add_test(
@ -90,19 +98,19 @@ for conv_kind in ["fprop", "wgrad", "dgrad"]:
)
# Split-K
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
split_k_slices=2)
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
split_k_slices=5)
# Swizzling functor
add_test(
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 64, 32],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)
# Tests for few channels and fixed channels
@ -113,14 +121,14 @@ for c, tb, stage, inst in zip([2, 1],
[[16, 8, 16], [16, 8, 8]]):
add_test(
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=tb,
opclass="tensor_op", threadblock_shape=tb,
warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
)
# F16, tensor op, fixed channels
for c in [8, 4, 2]:
add_test(
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
)
@ -128,11 +136,11 @@ for c in [8, 4, 2]:
for activation in ["relu", "leaky_relu"]:
for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
add_test(
Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
opclass="tensor_op", threadblock_shape=[128, 128, 64],
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
split_k_slices=split_k_slices, activation=activation)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,425 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for Conv2d tests.
"""
import torch
import cutlass
from cutlass import (
ConvKind,
ConvMode,
DataType,
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames,
SplitKMode,
)
from cutlass.backend.utils.software import SubstituteTemplate
from cutlass.shape import Conv2DProblemSize
from cutlass.utils.datatypes import numpy_type, torch_type
from conv2d_problem_sizes import TestbedConv2dProblemSizes
def get_name_conv2d(
    arch,
    conv_kind,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm,
    swizzle,
    split_k_mode,
    split_k_slices,
    activation
):
    """
    Generates a procedural name for a conv2d test case.

    :param arch: compute capability of the kernel being generated
    :param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad), substituted verbatim
    :type conv_kind: str
    :param element: data type used for both operand A and operand B in the name
    :param element_accumulator: data type used in accumulation
    :param element_output: data type used for operand C in the name
    :param opclass: class of operation being performed, substituted verbatim
    :type opclass: str
    :param threadblock_shape: indexable container with the three threadblock tile dimensions
    :param warp_count: indexable container with the warp count per tile dimension; the
        threadblock shape is floor-divided by it to obtain the warp tile in the name
    :param instruction_shape: indexable container with the three instruction dimensions
    :param stages: number of pipeline stages
    :param iterator_algorithm: iterator algorithm name, or None to use "AUTO" in the name
    :param swizzle: swizzling stride, or None to use 1 in the name
    :param split_k_mode: split-K mode, substituted verbatim
    :type split_k_mode: str
    :param split_k_slices: number of split-K slices
    :param activation: activation name, substituted verbatim
    :type activation: str

    :return: the test name
    :rtype: str
    """
    iter_alg = "AUTO" if iterator_algorithm is None else iterator_algorithm
    swizzle_stride = 1 if swizzle is None else swizzle

    name_format = (
        "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_"
        "${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_"
        "${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_"
        "stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
    )

    # A and B share one element type, so both placeholders get the same name.
    operand_name = DataTypeNames[element]

    substitutions = dict(
        arch=str(arch),
        conv_kind=conv_kind,
        iter_alg=iter_alg,
        eA=operand_name,
        eB=operand_name,
        eC=DataTypeNames[element_output],
        opclass=opclass,
        acc=DataTypeNames[element_accumulator],
        tbM=str(threadblock_shape[0]),
        tbN=str(threadblock_shape[1]),
        tbK=str(threadblock_shape[2]),
        wM=str(threadblock_shape[0] // warp_count[0]),
        wN=str(threadblock_shape[1] // warp_count[1]),
        wK=str(threadblock_shape[2] // warp_count[2]),
        IM=str(instruction_shape[0]),
        IN=str(instruction_shape[1]),
        IK=str(instruction_shape[2]),
        stages=str(stages),
        swizzle=str(swizzle_stride),
        split_k_mode=split_k_mode,
        split_k_slices=str(split_k_slices),
        activation=activation,
    )
    return SubstituteTemplate(name_format, substitutions)
def conv2d_few_channel_problemsizes(channels):
    """
    Build the problem sizes used by the few-channels and fixed-channels tests.

    :param channels: channel count placed in the fourth and eighth constructor arguments
    :return: list of ``Conv2DProblemSize``
    """
    # Each row is (a, k, f, s), expanded into
    #   Conv2DProblemSize(1, a, a, channels, k, f, f, channels, 1, 1, s, s, 1, 1, ...)
    # Judging by the argument order, a is presumably the square input extent,
    # k the filter count, f the square filter extent and s the stride -- TODO confirm.
    cases = [
        (8, 16, 3, 2),
        (16, 16, 3, 2),
        (16, 16, 7, 1),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    ]
    return [
        Conv2DProblemSize(
            1, extent, extent, channels,
            filters, window, window, channels,
            1, 1,
            step, step,
            1, 1,
            ConvMode.CrossCorrelation,
            1, 1
        )
        for extent, filters, window, step in cases
    ]
def validate_problem_size(ps, conv_kind, split_k_slices):
    """
    Decide whether a problem size can be run for the given convolution kind.

    :param ps: problem size exposing H, W, R, S, P, Q and the pad/stride/dilation values
    :param conv_kind: the convolution type as a string (e.g. "dgrad")
    :type conv_kind: str
    :param split_k_slices: number of split-K slices requested
    :type split_k_slices: int

    :return: False when the stored P/Q disagree with the extents recomputed from the
        other fields, or when split-K is requested for a strided dgrad; True otherwise
    :rtype: bool
    """
    def output_extent(size, pad, dilation, window, stride):
        return (size + 2 * pad - dilation * (window - 1) - 1) // stride + 1

    expected_p = output_extent(ps.H, ps.pad_h, ps.dilation_h, ps.R, ps.stride_h)
    expected_q = output_extent(ps.W, ps.pad_w, ps.dilation_w, ps.S, ps.stride_w)
    if expected_p != ps.P or expected_q != ps.Q:
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad
    unsupported_split_k = (
        conv_kind == "dgrad"
        and split_k_slices > 1
        and (ps.stride_h > 1 or ps.stride_w > 1)
    )
    return not unsupported_split_k
class Conv2dLauncherFrontend:
    """
    Runs a ``cutlass.Conv2d`` plan on random torch tensors and compares the output
    with a reference computed by torch.
    """

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
        """
        :param plan: the Conv2d plan to run; its conv kind, element types and
            activation are read here
        :type plan: cutlass.Conv2d
        :param seed: value passed to ``torch.manual_seed`` before tensors are created
        :type seed: int
        :param backend: stored on the instance; not consulted elsewhere in this class
        """
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator
        self.layout_A = LayoutType.TensorNHWC
        self.layout_B = LayoutType.TensorNHWC
        self.layout_C = LayoutType.TensorNHWC
        self.layout_D = LayoutType.TensorNHWC

        self.element_compute = DataType.f32

        # Half-precision inputs draw from a narrower range of values; presumably this
        # keeps results exactly comparable with torch.equal -- TODO confirm.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            self.rand_max = 1
        else:
            self.rand_max = 4
        self.activation = plan.activation

    def uniform_init(self, size, dtype):
        """
        Create a CUDA tensor of integer-valued entries in channels_last memory format.

        Values are drawn uniformly from ``[-rand_max - 0.5, rand_max - 0.5)`` and
        rounded up with ``torch.ceil``.

        :param size: size of the tensor to create
        :param dtype: cutlass data type, converted with ``torch_type``
        :return: the initialized tensor
        """
        tensor = torch.ceil(
            torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
        ).to(memory_format=torch.channels_last)
        return tensor

    def reference(self, ps, A, B, C, alpha, beta, activation):
        """
        Compute ``activation(alpha * conv(A, B) + beta * C)`` with torch.

        The stride, padding and dilation of ``ps`` are passed to every reference
        call so that the reference is computed with the same parameters that
        ``run`` hands to the operation.

        :param ps: problem size providing extents, stride, padding and dilation
        :param A: operand A as passed to the operation
        :param B: operand B as passed to the operation
        :param C: operand C as passed to the operation
        :param alpha: scalar multiplying the convolution result
        :param beta: scalar multiplying C
        :param activation: activation to apply; relu and leaky_relu (slope 0.5) are
            recognized, anything else leaves the result unchanged
        :return: the reference output tensor
        :raises Exception: if ``self.conv_kind`` is not Fprop, Dgrad or Wgrad
        """
        if self.conv_kind == ConvKind.Fprop:
            torch_result = alpha * torch.ops.aten.conv2d(
                A,
                B,
                stride=(ps.stride_h, ps.stride_w),
                padding=(ps.pad_h, ps.pad_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Dgrad:
            # For dgrad, A is the output gradient and B the filter.
            torch_result = alpha * torch.nn.grad.conv2d_input(
                (ps.N, ps.C, ps.H, ps.W),
                B,
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        elif self.conv_kind == ConvKind.Wgrad:
            # For wgrad, A is the output gradient and B the input activation.
            torch_result = alpha * torch.nn.grad.conv2d_weight(
                B,
                (ps.K, ps.C, ps.R, ps.S),
                A,
                padding=(ps.pad_h, ps.pad_w),
                stride=(ps.stride_h, ps.stride_w),
                dilation=(ps.dilation_h, ps.dilation_w)
            ) + beta * C
        else:
            raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

        if activation == cutlass.backend.epilogue.relu:
            torch_result = torch.nn.functional.relu(torch_result)
        elif activation == cutlass.backend.epilogue.leaky_relu:
            torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
        return torch_result

    def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        """
        Run the plan on one problem size and compare the result with the torch reference.

        :param ps: problem size to run
        :param split_k_mode: split-K mode forwarded to the operation
        :param split_k_slices: number of split-K slices forwarded to the operation
        :param alpha: scalar forwarded to both the operation and the reference
        :param beta: scalar forwarded to both the operation and the reference
        :return: True if the output equals the reference exactly (``torch.equal``)
        :rtype: bool
        :raises Exception: if ``self.conv_kind`` is not Fprop, Dgrad or Wgrad
        """
        # Sizes are given in (N, C, H, W)-style order; uniform_init converts the
        # tensors to channels_last memory format.
        if self.conv_kind == ConvKind.Fprop:
            tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
        elif self.conv_kind == ConvKind.Dgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
            tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
        elif self.conv_kind == ConvKind.Wgrad:
            tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
            tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
            tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        # Re-seed on every run so each problem size sees repeatable data.
        torch.manual_seed(self.seed)

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)

        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
            stride=(ps.stride_h, ps.stride_w),
            padding=(ps.pad_h, ps.pad_w),
            dilation=(ps.dilation_h, ps.dilation_w),
            alpha=alpha, beta=beta,
            split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)

        # Wait for outstanding GPU work before comparing the two outputs.
        torch.cuda.synchronize()
        passed = torch.equal(tensor_D, tensor_D_ref)

        return passed
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """
    Build a test method for the given kernel specification and attach it to ``cls``.

    The method is stored on ``cls`` under the name produced by ``get_name_conv2d``.

    :param cls: class that receives the generated test method
    :param cc: compute capability; only used when building the test name
    :param conv_kind: convolution kind passed to ``cutlass.Conv2d``
    :param problem_sizes: iterable of problem sizes to run; entries rejected by
        ``validate_problem_size`` are skipped
    :param element: data type passed as ``element`` to ``cutlass.Conv2d``
    :param element_accumulator: accumulator data type
    :param element_output: data type used for both C and D
    :param opclass: opcode class assigned to the plan
    :param threadblock_shape: threadblock tile shape
    :param warp_count: number of warps along each tile dimension
    :param instruction_shape: instruction shape
    :param stages: number of pipeline stages
    :param iterator_algorithm: iterator algorithm; left at the plan's default when None
    :param swizzle: swizzling stride; left at the plan's default when None
    :param split_k_mode: split-K mode forwarded to the launcher
    :param split_k_slices: number of split-K slices forwarded to the launcher
    :param activation: ``"identity"``, ``"leaky_relu"`` or the name of an attribute
        of ``cutlass.epilogue``

    :return: the generated test function
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Describe the kernel: plan first, then opclass, then the tile description.
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )
        plan.opclass = opclass
        plan.tile_description = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        # Optional knobs are only touched when the caller supplied them.
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation == "leaky_relu":
            # leaky_relu is assigned together with its 0.5 argument
            plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
        elif activation != "identity":
            plan.activation = getattr(cutlass.epilogue, activation)

        launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
        runnable = (
            ps for ps in problem_sizes
            if validate_problem_size(ps, conv_kind, split_k_slices)
        )
        for ps in runnable:
            # alpha = 1.0, beta = 2.0
            self.assertTrue(launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))

    setattr(cls, test_name, run)
    return run
def get_conv_problems():
    """
    Return the Conv2d problem sizes used by the tests.

    Starts from ``TestbedConv2dProblemSizes(64).all`` and appends three additional
    ``Conv2DProblemSize`` entries.

    :return: the problem-size collection with the extra entries appended
    """
    # 64: minimum channel size
    problems = TestbedConv2dProblemSizes(64).all

    # Insert alignment 4 & 2 tests.
    # Each row holds the positional arguments of one Conv2DProblemSize.
    extra_specs = (
        (1, 4, 4, 12,
         8, 3, 3, 12,
         0, 0,
         3, 3,
         1, 1,
         ConvMode.CrossCorrelation,
         1, 1),
        (1, 4, 4, 14,
         8, 3, 3, 14,
         0, 0,
         3, 3,
         1, 1,
         ConvMode.CrossCorrelation,
         1, 1),
        (1, 23, 56, 98,
         128, 3, 3, 98,
         4, 5,
         3, 3,
         1, 1,
         ConvMode.CrossCorrelation,
         1, 1),
    )
    problems += [Conv2DProblemSize(*spec) for spec in extra_specs]
    return problems

View File

@ -1,6 +1,6 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
@ -30,13 +30,15 @@
#
#################################################################################################
import cutlass.backend
import pathlib
import unittest
from cutlass.backend.memory_manager import *
if __name__ == '__main__':
cutlass.backend.get_memory_pool(2**32, 2**32)
loader = unittest.TestLoader()
tests = loader.discover('./', 'conv2d_*.py')
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
tests = loader.discover(script_dir, 'conv2d_*.py')
testRunner = unittest.runner.TextTestRunner()
testRunner.run(tests)
results = testRunner.run(tests)
if not results.wasSuccessful():
raise Exception('Test cases failed')

View File

@ -39,7 +39,6 @@ import tempfile
import unittest
import cutlass
import cutlass_bindings
if cutlass.utils.datatypes.torch_available:
import torch
@ -94,7 +93,7 @@ def _generate_conv2d_problem(conv_kind, dtype, ps):
:type conv_kind: str
:param dtype: data type of tensors
:param problem_size: the conv2d problem size
:type problem_size: cutlass_bindings.conv.Conv2dProblemSize
:type problem_size: cutlass.shape.Conv2DProblemSize
:return: initialized tensors A, B, C, and D
:rtype: list
@ -196,13 +195,11 @@ class PyTorchExtensionTest(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
problem_size = cutlass.shape.Conv2DProblemSize(
1, 4, 4, 16,
8, 3, 3, 16,
0, 0,
3, 3,
1, 1
)
@ -239,13 +236,13 @@ class PyTorchExtensionTest(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
problem_size = cutlass.shape.Conv2DProblemSize(
1, 4, 4, 16,
8, 3, 3, 16,
0, 0,
3, 3,
1, 1,
cutlass.ConvMode.CrossCorrelation,
1, 1
)
@ -273,13 +270,13 @@ class PyTorchExtensionTest(unittest.TestCase):
with tempfile.TemporaryDirectory() as tmpdir:
mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
cutlass_bindings.MatrixCoord(3, 3),
cutlass_bindings.MatrixCoord(1, 1),
cutlass_bindings.conv.Mode.cross_correlation,
problem_size = cutlass.shape.Conv2DProblemSize(
1, 4, 4, 16,
8, 3, 3, 16,
0, 0,
3, 3,
1, 1,
cutlass.ConvMode.CrossCorrelation,
1, 1
)

View File

@ -0,0 +1,100 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for compute node in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass import swizzle
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTComputeSM90(EVTTestCaseBase):
    """
    Epilogue tests whose visitor functions consist of compute operations only.

    The nested ``evt_*`` functions are handed to ``EVTTestBed`` as-is.
    NOTE(review): their local names match the keys of ``example_inputs`` and the
    result keys, so they are presumably significant to the testbed -- confirm
    before renaming anything inside them.
    """

    def test_arith(self):
        """
        Test arithmetic ops (add, multiply, subtract, divide)
        """
        def evt_arith_compute(accum, C, alpha, beta, gamma):
            D = ((accum + C) * alpha - gamma) / beta
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            batched_shape = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, batched_shape),
                "C": self.fake_tensor(self.element, batched_shape),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, batched_shape)
            }

            testbed = EVTTestBed(self.element, evt_arith_compute, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha", "beta", "gamma"], ["D"], l)

    def test_func_call(self):
        """
        Test function calls (``multiply_add`` and ``relu``)
        """
        def evt_func_call(accum, C, alpha, beta, gamma):
            D = multiply_add(relu(accum + alpha) + C, beta, gamma)
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            batched_shape = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, batched_shape),
                "C": self.fake_tensor(self.element, batched_shape),
                "alpha": 1.5,
                "beta": 0.5,
                "gamma": 2.5,
                "D": self.fake_tensor(self.element, batched_shape)
            }

            testbed = EVTTestBed(self.element, evt_func_call, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha", "beta", "gamma"], ["D"], l)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,173 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for layout nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLayoutSM90(EVTTestCaseBase):
    """
    Epilogue tests that use layout operations (``permute`` and ``reshape``).

    The nested ``evt_*`` functions are handed to ``EVTTestBed`` as-is.
    NOTE(review): their local names match the keys of ``example_inputs`` and the
    result keys, so they are presumably significant to the testbed -- confirm
    before renaming anything inside them.
    """

    def test_permute_1(self):
        """
        Permute F and C with indices (0, 2, 1), add them, and permute the sum back;
        D is declared with shape [l, m, n]
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D_permute = F_permute + permute(C, indices=(0, 2, 1))
            D = permute(D_permute, indices=(0, 2, 1))
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, lmn),
                "F": self.fake_tensor(self.element, lmn),
                "D": self.fake_tensor(self.element, lmn),
            }

            testbed = EVTTestBed(self.element, evt_permute, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha"], ["D", "F"], l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_2(self):
        """
        Permute F with indices (0, 2, 1) and add C; C and D are declared with
        shape [l, n, m]
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(0, 2, 1))
            D = F_permute + C
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            lnm = (l, n, m)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, lnm),
                "F": self.fake_tensor(self.element, lmn),
                "D": self.fake_tensor(self.element, lnm),
            }

            testbed = EVTTestBed(self.element, evt_permute, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha"], ["D", "F"], l)

    @unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
    def test_permute_3(self):
        """
        Permute F with indices (1, 0, 2) and add C; C and D are declared with
        shape [m, l, n]
        """
        def evt_permute(accum, alpha, C):
            F = alpha * accum
            F_permute = permute(F, indices=(1, 0, 2))
            D = F_permute + C
            return D, F

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            mln = (m, l, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "alpha": 0.5,
                "C": self.fake_tensor(self.element, mln),
                "F": self.fake_tensor(self.element, lmn),
                "D": self.fake_tensor(self.element, mln),
            }

            testbed = EVTTestBed(self.element, evt_permute, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha"], ["D", "F"], l)

    def test_reshape(self):
        """
        Reshape an extra input: TensorE, declared as [16, 32], is reshaped to
        [512, 1] before being added to alpha * accum
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            E_reshape = reshape(TensorE, new_shape=(512, 1))
            D = F + E_reshape
            return D

        full_shape = (self.l, self.m, self.n)
        example_inputs = {
            "accum": self.fake_tensor(self.element, full_shape),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (16, 32)),
            "D": self.fake_tensor(self.element, full_shape),
        }

        testbed = EVTTestBed(self.element, evt_reshape, example_inputs)
        testbed.verify(self.problem_size, ["alpha", "TensorE"], ["D"], self.l)

    def test_reshape2(self):
        """
        Reshape the scaled accumulator: F is reshaped to [2, 3, 512, 256] before
        TensorE, declared as [2, 3, 1, n], is added
        """
        def evt_reshape(accum, alpha, TensorE):
            F = alpha * accum
            F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
            D = F_reshape + TensorE
            return D

        example_inputs = {
            "accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
            "alpha": 0.5,
            "TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
            "D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
        }

        testbed = EVTTestBed(self.element, evt_reshape, example_inputs)
        testbed.verify(self.problem_size, ["alpha", "TensorE"], ["D"], self.l)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,142 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for load nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTLoadSM90(EVTTestCaseBase):
    """
    Epilogue tests that load extra operands of various shapes and add them to
    the accumulator.

    The nested ``evt_*`` functions are handed to ``EVTTestBed`` as-is.
    NOTE(review): their local names match the keys of ``example_inputs`` and the
    result keys, so they are presumably significant to the testbed -- confirm
    before renaming anything inside them.
    """

    def test_tensor_load(self):
        """
        Load extra tensors with shapes [m, n] and [l, m, n]
        """
        def evt_tensor_load(accum, C, aux, aux_batch):
            D = accum + C + aux + aux_batch
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "C": self.fake_tensor(self.element, lmn),
                "aux": self.fake_tensor(self.element, (m, n)),
                "aux_batch": self.fake_tensor(np.float32, lmn),
                "D": self.fake_tensor(self.element, lmn),
            }

            testbed = EVTTestBed(self.element, evt_tensor_load, example_inputs)
            testbed.verify((m, n, k), ["C", "aux", "aux_batch"], ["D"], l)

    def test_row_broadcast(self):
        """
        Load extra tensors with shapes [n] and [l, 1, n]
        """
        def evt_row_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "C": self.fake_tensor(self.element, lmn),
                "bias": self.fake_tensor(self.element, (n,)),
                "bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
                "D": self.fake_tensor(self.element, lmn),
            }

            testbed = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
            testbed.verify((m, n, k), ["C", "bias", "bias_batch"], ["D"], l)

    def test_column_broadcast(self):
        """
        Load extra tensors with shapes [m, 1] and [l, m, 1]
        """
        def evt_column_broadcast(accum, C, bias, bias_batch):
            D = accum + C + bias + bias_batch
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "C": self.fake_tensor(self.element, lmn),
                "bias": self.fake_tensor(self.element, (m, 1)),
                "bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
                "D": self.fake_tensor(self.element, lmn),
            }

            testbed = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
            testbed.verify((m, n, k), ["C", "bias", "bias_batch"], ["D"], l)

    def test_scalar_broadcast(self):
        """
        Load a Python scalar and an extra tensor with shape [l, 1, 1]
        """
        def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
            D = accum + C + alpha + alpha_batch
            return D

        for m, n, k, l in self.get_problem_sizes(8):
            lmn = (l, m, n)
            example_inputs = {
                "accum": self.fake_tensor(self.element, lmn),
                "C": self.fake_tensor(self.element, lmn),
                "alpha": 0.5,
                "alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
                "D": self.fake_tensor(self.element, lmn),
            }

            testbed = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
            testbed.verify((m, n, k), ["C", "alpha", "alpha_batch"], ["D"], l)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,274 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unittest for mixed types of nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from cutlass.swizzle import ThreadblockSwizzleStreamK
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTMixedSM90(EVTTestCaseBase):
    """
    Epilogue tests that combine loads, compute, reductions, and stores in one
    visitor function.

    The nested ``evt_mixed_dag*`` functions are handed to ``EVTTestBed`` as-is
    and are deliberately repeated in every test.
    NOTE(review): their local names match the keys of the example inputs and the
    result keys, so they are presumably significant to the testbed -- confirm
    before renaming or hoisting them.
    """

    def _mixed_dag_inputs(self, element, tensor_shape, m, n, reduce_element):
        """
        Build the example-input dictionary shared by all mixed-DAG tests.

        :param element: data type of the non-reduction tensors
        :param tensor_shape: shape used for accum, C, aux, D, and F
        :param m: row extent; cbias and E_col_max are declared as (m, 1)
        :param n: column extent; rbias and F_row_max are declared as (n,)
        :param reduce_element: data type of F_row_max and E_col_max

        :return: dictionary mapping operand names to fake tensors or scalars
        :rtype: dict
        """
        return {
            "accum": self.fake_tensor(element, tensor_shape),
            "alpha": 1.0,
            "C": self.fake_tensor(element, tensor_shape),
            "beta": 1.0,
            "aux": self.fake_tensor(element, tensor_shape),
            "cbias": self.fake_tensor(element, (m, 1)),
            "rbias": self.fake_tensor(element, (n,)),
            "D": self.fake_tensor(element, tensor_shape),
            "F": self.fake_tensor(element, tensor_shape),
            "F_row_max": self.fake_tensor(reduce_element, (n,)),
            "E_col_max": self.fake_tensor(reduce_element, (m, 1))
        }

    def test_mixed_dag(self):
        """
        Mixed DAG with default testbed settings over the supported alignments
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        # Sm90 EVT currently only supports 128-bit alignment
        alignments = [2, 4, 8] if device_cc() == 80 else [8,]

        for alignment in alignments:
            for m, n, k, l in self.get_problem_sizes(alignment):
                example_inputs = self._mixed_dag_inputs(
                    self.element, (l, m, n), m, n, DataType.f32)

                testbed = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
                testbed.verify(
                    (m, n, k),
                    ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                    ["D", "F", "F_row_max", "E_col_max"],
                    l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_float(self):
        """
        Mixed DAG with float32 tensors and an f32 testbed element
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        for alignment in [3, 2, 4]:
            for m, n, k, l in self.get_problem_sizes(alignment):
                example_inputs = self._mixed_dag_inputs(
                    np.float32, (l, m, n), m, n, np.float32)

                testbed = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
                testbed.verify(
                    (m, n, k),
                    ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                    ["D", "F", "F_row_max", "E_col_max"],
                    l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stage2(self):
        """
        Mixed DAG with ``epilogue_stages=2``
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = self._mixed_dag_inputs(
                self.element, (l, m, n), m, n, DataType.f32)

            testbed = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
            testbed.verify(
                (m, n, k),
                ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                ["D", "F", "F_row_max", "E_col_max"],
                l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_partition_k(self):
        """
        Mixed DAG with a [2, 2, 2] warp count and ``epilogue_stages=2``
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        for m, n, k, l in self.get_problem_sizes(8):
            example_inputs = self._mixed_dag_inputs(
                self.element, (l, m, n), m, n, DataType.f32)

            # Built anew for every problem size so each testbed gets its own dictionary.
            tile_description = {
                "threadblock_shape": [128, 128, 64],
                "warp_count": [2, 2, 2]
            }

            testbed = EVTTestBed(
                self.element, evt_mixed_dag, example_inputs,
                tile_description=tile_description, epilogue_stages=2)
            testbed.verify(
                (m, n, k),
                ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                ["D", "F", "F_row_max", "E_col_max"],
                l)

    @unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
    def test_mixed_dag_stream_k(self):
        """
        Mixed DAG with the stream-K swizzling functor, with and without an
        explicit tile description
        """
        def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        # High per-sm occupancy tile_description
        tile_description = {
            "threadblock_shape": [128, 128, 32],
            "warp_count": [2, 2, 1],
            "stages": 3
        }

        for td in [None, tile_description]:
            for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
                # A single batch is described with rank-2 tensors.
                tensor_shape = (m, n) if l == 1 else (l, m, n)
                example_inputs = self._mixed_dag_inputs(
                    self.element, tensor_shape, m, n, DataType.f32)

                # The tile description is only passed when one was chosen.
                tile_kwargs = {} if td is None else {"tile_description": td}
                testbed = EVTTestBed(
                    self.element, evt_mixed_dag, example_inputs,
                    **tile_kwargs,
                    swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")

                testbed.verify(
                    (m, n, k),
                    ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                    ["D", "F", "F_row_max", "E_col_max"],
                    l)

    def test_mixed_dag_no_batch(self):
        """
        Mixed DAG with rank-2 tensors, verified with a batch count of 1
        """
        def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
            F = alpha * accum + (beta * C + aux)
            F_row_max = max(F, dim=[0, 1])
            E = relu(F + 1) + cbias + rbias
            E_col_max = max(E, dim=[0, 2])
            D = E + F
            return D, F, F_row_max, E_col_max

        for m, n, k, _ in self.get_problem_sizes(8):
            example_inputs = self._mixed_dag_inputs(
                self.element, (m, n), m, n, DataType.f32)

            testbed = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
            testbed.verify(
                (m, n, k),
                ["alpha", "C", "beta", "aux", "cbias", "rbias"],
                ["D", "F", "F_row_max", "E_col_max"],
                1)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,155 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Unit test for store nodes in SM90
"""
import logging
import unittest
import cutlass
from cutlass.backend import *
from cutlass.epilogue import *
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
cutlass.set_log_level(logging.WARNING)
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class TestEVTStoreSM90(EVTTestCaseBase):
def test_aux_store(self):
    """
    Store the intermediate F alongside D; both are declared with shape [l, m, n]
    """
    # Passed to the testbed unchanged.
    # NOTE(review): local names match the result keys -- confirm before renaming.
    def evt_aux_store(accum, alpha, C):
        F = alpha * accum
        D = F + C
        return D, F

    for m, n, k, l in self.get_problem_sizes(8):
        lmn = (l, m, n)
        example_inputs = {
            "accum": self.fake_tensor(self.element, lmn),
            "alpha": 0.5,
            "C": self.fake_tensor(self.element, lmn),
            "F": self.fake_tensor(self.element, lmn),
            "D": self.fake_tensor(self.element, lmn),
        }

        testbed = EVTTestBed(self.element, evt_aux_store, example_inputs)
        testbed.verify((m, n, k), ["C", "alpha"], ["D", "F"], l)
def test_col_reduce(self):
    """
    Max-reductions over the n dimension: [l, m, n] -> [l, m, 1] for accum, and
    over the l and n dimensions: [l, m, n] -> [m, 1] for F
    """
    # Passed to the testbed unchanged.
    # NOTE(review): local names match the result keys -- confirm before renaming.
    def evt_row_reduce(accum, alpha, C):
        acc_row_max = max(accum, dim=[2,])
        F = alpha * accum
        F_row_max = max(F, dim=[0, 2])
        D = F + C
        return D, F_row_max, acc_row_max

    for m, n, k, l in self.get_problem_sizes(8):
        lmn = (l, m, n)
        example_inputs = {
            "accum": self.fake_tensor(self.element, lmn),
            "alpha": 2.0,
            "C": self.fake_tensor(self.element, lmn),
            "F_row_max": self.fake_tensor(np.float32, (m, 1)),
            "acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
            "D": self.fake_tensor(self.element, lmn),
        }

        testbed = EVTTestBed(self.element, evt_row_reduce, example_inputs)
        testbed.verify((m, n, k), ["C", "alpha"], ["D", "F_row_max", "acc_row_max"], l)
def test_row_reduce(self):
"""
Reduction [m, n] -> [n]
"""
def evt_col_reduce(accum, alpha, C):
acc_col_max = max(accum, dim=[1,])
F = alpha * accum
F_col_max = max(F, dim=[0, 1])
D = F + C
return D, F_col_max, acc_col_max
for m, n, k, l in self.get_problem_sizes(8):
example_inputs = {
"accum": self.fake_tensor(self.element, (l, m, n)),
"alpha": 2.0,
"C": self.fake_tensor(self.element, (l, m, n)),
"F_col_max": self.fake_tensor(np.float32, (n,)),
"acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
"D": self.fake_tensor(self.element, (l, m, n)),
}
launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
input_keys = ["C", "alpha"]
result_keys = ["D", "F_col_max", "acc_col_max"]
launcher.verify((m, n, k), input_keys, result_keys, l)
def test_scalar_reduce(self):
"""
Reduction [m, n] -> [1,]
"""
def evt_scalar_reduce(accum, alpha, C):
acc_max = max(accum, dim=[1, 2])
F = alpha * accum
F_max = max(F, dim=[0, 1, 2])
D = F + C
return D, F_max, acc_max
for m, n, k, l in self.get_problem_sizes(8):
example_inputs = {
"accum": self.fake_tensor(self.element, (l, m, n)),
"alpha": 2.0,
"C": self.fake_tensor(self.element, (l, m, n)),
"acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
"F_max": self.fake_tensor(np.float32, (1,)),
"D": self.fake_tensor(self.element, (l, m, n)),
}
launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
input_keys = ["C", "alpha"]
result_keys = ["D", "F_max", "acc_max"]
launcher.verify((m, n, k), input_keys, result_keys, l)
# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -30,12 +30,14 @@
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
loader = unittest.TestLoader()
tests = loader.discover('./', 'gemm_*.py')
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
tests = loader.discover(script_dir, 'evt_*.py')
testRunner = unittest.runner.TextTestRunner()
results = testRunner.run(tests)
if not results.wasSuccessful():

View File

@ -0,0 +1,230 @@
################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
"""
Testbed classes of EVT
"""
import torch
import unittest
import cutlass
from cutlass import Tensor
import cutlass.backend.evt
from cutlass.profiler import CUDAEventProfiler
from cutlass.shape import GemmCoord
from cutlass.utils.datatypes import torch_type
class EVTReferenceModule:
    """
    Host reference for a GEMM followed by an epilogue visitor.

    ``run`` computes ``alpha * (A @ B) + beta * C`` with torch for the configured
    operand layouts; ``__call__`` uses it to build the accumulator and then
    applies the epilogue visitor to it.
    """
    def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
        """
        :param layout_A: layout of operand A (compared against ``cutlass.LayoutType``)
        :param layout_B: layout of operand B
        :param layout_C: layout of operand C and of the output
        :param epilogue_visitor: callable invoked with the epilogue arguments as keywords
        """
        self.layout_A = layout_A
        self.layout_B = layout_B
        self.layout_C = layout_C
        self.epilogue_visitor = epilogue_visitor

    @staticmethod
    def _as_row_major(tensor, layout, batch, rows, cols):
        """Return a (batch, rows, cols) view of ``tensor`` stored with ``layout``."""
        if layout == cutlass.LayoutType.RowMajor:
            return tensor.view((batch, rows, cols))
        # Any other layout is treated as column-major: view it as
        # (batch, cols, rows) and swap the last two dimensions.
        return torch.permute(tensor.view((batch, cols, rows)), (0, 2, 1))

    def run(self, A, B, C, problem_size, alpha, beta, batch=1):
        """
        Compute ``alpha * (A @ B) + beta * C`` and return it flattened.

        :param A: tensor holding operand A, viewable as (batch, m, k) in its layout
        :param B: tensor holding operand B, viewable as (batch, k, n) in its layout
        :param C: tensor holding operand C, viewable as (batch, m, n) in its layout
        :param problem_size: object exposing ``m``, ``n`` and ``k``
        :param alpha: scale applied to the matrix product
        :param beta: scale applied to C
        :param batch: number of batches

        :return: flattened result, laid out according to ``layout_C``
        """
        m, n, k = problem_size.m, problem_size.n, problem_size.k
        A_row = self._as_row_major(A, self.layout_A, batch, m, k)
        B_row = self._as_row_major(B, self.layout_B, batch, k, n)
        C_row = self._as_row_major(C, self.layout_C, batch, m, n)

        out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta

        # Only an explicitly column-major output is transposed back
        if self.layout_C == cutlass.LayoutType.ColumnMajor:
            out = torch.permute(out_row, (0, 2, 1))
        else:
            out = out_row
        return torch.flatten(out)

    def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
        """
        Compute the accumulator and run the epilogue visitor on it.

        :param epilogue_args: keyword arguments for the epilogue visitor. The
            accumulator is stored into this dict under ``"accum"``. ``None``
            (the default) is treated as an empty dict.

        :return: tuple of reference results (a single result is wrapped in a tuple)
        """
        # The declared default is None; without this guard the item assignment
        # below raises TypeError when the argument is omitted.
        if epilogue_args is None:
            epilogue_args = {}

        # Running the mainloop: alpha=1 and beta=0 yield the plain product A @ B
        accum = self.run(
            A, B, C, problem_size, 1.0, 0.0, batch=batch
        ).reshape(batch, problem_size.m, problem_size.n)

        # Running the epilogue
        epilogue_args["accum"] = accum
        references = self.epilogue_visitor(**epilogue_args)

        # Return the results
        if not isinstance(references, tuple):
            references = (references,)
        return references
class EVTTestBed:
    """
    Testbed that runs a GEMM with a traced epilogue visitor on the device and
    checks every result against a host reference
    """
    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
        """
        :param element: data type used for the GEMM plan and as default tensor type
        :param evt_fn: epilogue function to trace
        :param example_inputs: mapping from argument name to an example tensor or scalar
        :param profile: whether ``verify`` also profiles the kernel
        :param kwargs: optional ``tile_description``, ``swizzling_functor`` and
            ``epilogue_stages`` overrides
        """
        row_major = cutlass.LayoutType.RowMajor
        self.element = element
        self.example_inputs = example_inputs

        # GEMM plan, with the optional overrides applied in a fixed order
        self.plan = cutlass.op.Gemm(element=element, layout=row_major, element_accumulator=torch.float32)
        for option in ("tile_description", "swizzling_functor"):
            if option in kwargs:
                setattr(self.plan, option, kwargs[option])

        # Trace the epilogue function and attach the visitor to the plan
        visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
        if "epilogue_stages" in kwargs:
            visitor.epilogue_stages = kwargs["epilogue_stages"]
        self.plan.epilogue_visitor = visitor

        # Host reference: all operands are row-major
        self.reference_fn = EVTReferenceModule(row_major, row_major, row_major, visitor)
        self.profile = profile

    def get_torch_tensor(self, shape, dtype=None, fill=None):
        """
        Create a CUDA tensor of the given shape.

        :param shape: shape of the tensor
        :param dtype: element type; defaults to the testbed's element type
        :param fill: constant fill value; when ``None`` the tensor holds random
            integer-valued entries (ceil of a uniform sample from [-4.5, 3.5))
        """
        torch_dtype = torch_type(self.element if dtype is None else dtype)
        if fill is not None:
            return torch.full(shape, fill, dtype=torch_dtype, device="cuda")
        samples = torch.empty(size=shape, dtype=torch_dtype, device="cuda").uniform_(-4.5, 3.5)
        return torch.ceil(samples)

    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
        """
        Run the kernel and the host reference and assert that all results match

        :param problem_size: (m, n, k) of the GEMM
        :param input_keys: names of the example inputs passed to the epilogue
        :param result_keys: names of the example inputs written by the epilogue,
            in the order the epilogue function returns them
        :param batch_count: number of batches
        """
        problem_size = GemmCoord(*problem_size)

        # GEMM operands
        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))

        # Epilogue arguments, created in the iteration order of the example inputs
        epilogue_args = {}
        for key, example in self.example_inputs.items():
            is_input = key in input_keys
            if not is_input and key not in result_keys:
                continue
            if not isinstance(example, Tensor):
                # Scalars are forwarded unchanged
                epilogue_args[key] = example
            elif is_input:
                epilogue_args[key] = self.get_torch_tensor(example.shape, example.element)
            else:
                # Result buffers are pre-filled; "max" results start from a low value
                fill = -1000 if "max" in key else 0
                epilogue_args[key] = self.get_torch_tensor(example.shape, example.element, fill=fill)

        tensor_D = epilogue_args["D"]
        # Without a C operand, D stands in for it
        tensor_C = epilogue_args["C"] if "C" in epilogue_args else tensor_D

        # Device kernel
        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)

        # Host reference, fed with the input arguments only
        evt_args_inputs = {key: epilogue_args[key] for key in input_keys}
        reference_results = self.reference_fn(
            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)

        # Exact comparison of every result
        for name, expected in zip(result_keys, reference_results):
            assert torch.equal(epilogue_args[name].flatten(), expected.flatten())

        # Optional profiling
        if self.profile:
            profiler = CUDAEventProfiler(
                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
                visitor_args = epilogue_args
            )
            print(f"Cutlass Python Duration: {profiler()}")
class EVTTestCaseBase(unittest.TestCase):
    """
    Base class for EVT Unittest

    Holds the default element type and problem size, and provides helpers to
    describe example tensors and to enumerate problem sizes.
    """
    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
        """
        :param methodName: name of the test method, forwarded to ``unittest.TestCase``
        :param lmnk: default (l, m, n, k) problem size
        """
        super().__init__(methodName)
        self.element = cutlass.DataType.f16
        self.l, self.m, self.n, self.k = lmnk
        self.problem_size = (self.m, self.n, self.k)
        # Fixed seed so randomly generated tensors are reproducible
        torch.random.manual_seed(42)

    def fake_tensor(self, element, shape):
        """Return a row-major ``Tensor`` description with the given element type and shape."""
        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)

    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
        """
        Enumerate the (m, n, k, l) problem sizes to test.

        :param alignment: alignment from which the m and n extents are derived
        :param k: k extent; a falsy value (``None`` or 0) selects ``self.k``
        :param batch_count: iterable of batch counts; the default is an immutable
            tuple rather than a shared list

        :return: list of (m, n, k, l) tuples, with m varying slowest and l fastest
        """
        k = k if k else self.k
        sizes_m = [alignment, 512 - 3 * alignment]
        sizes_n = [alignment, 512 - alignment]
        # A larger extent is added only for alignments that are multiples of 8
        if alignment % 8 == 0:
            sizes_m.append(768)
            sizes_n.append(768)
        return [
            (m, n, k, l)
            for m in sizes_m
            for n in sizes_n
            for l in batch_count
        ]

View File

@ -35,15 +35,15 @@ High-level tests for running batched GEMMs
"""
from functools import partial
from math import prod
import cutlass
import logging
import torch
from math import prod
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
import torch
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
@ -130,10 +130,5 @@ class GemmF16Batched(unittest.TestCase):
self.run_batched((3,), False, True, False)
self.run_batched((2, 3), False, True, False)
def test_batched_C(self):
self.run_batched((3,), False, False, True)
self.run_batched((2, 3), False, False, True)
if __name__ == '__main__':
unittest.main()

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM80
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 80

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM90
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 90

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F32 operands on SM80
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 80

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM80
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 80

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM90
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 90

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM80
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 80

View File

@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM90
"""
from functools import partial
import cutlass
import logging
import unittest
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
import cutlass
from cutlass.backend.utils.device import device_cc
from utils import LayoutCombination, add_test_gemm
cutlass.set_log_level(logging.WARNING)
cc = 90

View File

@ -0,0 +1,387 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from math import prod
import os
import re
import subprocess
import torch
from cutlass import (
DataType,
DataTypeSize,
GemmUniversalMode,
LayoutType,
OpcodeClass,
ShortDataTypeNames,
SwizzlingFunctor
)
from cutlass.backend import compiler
from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
from cutlass.backend.memory_manager import get_allocated_size
from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
from cutlass.shape import GemmCoord, MatrixCoord
from cutlass.utils.datatypes import torch_type
class GemmUniversalLauncher:
    """
    Compiles a GEMM operation and runs it against a torch reference.

    The launcher generates operand tensors, launches the kernel (followed by a
    reduction kernel in split-K-parallel mode), and optionally compares the
    output with ``alpha * (A @ B) + beta * C`` computed with torch.
    """
    def __init__(
        self,
        operation,
        seed=2080,
        verification=True,
        iterations=500,
        compiler_mode= "nvcc",
        **kwargs,
    ) -> None:
        """
        :param operation: GEMM operation to compile and test
        :param seed: seed for torch's random generator, applied at the start of ``run``
        :param verification: whether ``run`` compares the output with the reference
        :param iterations: accepted but not used in this constructor
        :param compiler_mode: "nvcc" or "nvrtc"; any other value raises ``Exception``
        :param kwargs: accepted but not used in this constructor
        """
        # Create the reduction kernel, if needed
        self.reduction_operation: ReductionOperation = ReductionOperation(
            shape=MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C,
            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.epilogue_functor.element_epilogue,
            epilogue_functor=operation.epilogue_functor,
            count=operation.C.alignment,
        )
        self.math_operation = operation.tile_description.math_instruction.math_operation
        self.verification = verification
        # Select the compiler backend
        if compiler_mode == "nvcc":
            compiler.nvcc()
        elif compiler_mode == "nvrtc":
            compiler.nvrtc()
        else:
            raise Exception(f"Unexpected compiler string {compiler_mode}")
        op_list = [operation]
        if operation.arch < 90:
            # Split K via Python is currently only supported for pre-SM90 kernels
            op_list.append(self.reduction_operation)
        compiler.add_module(op_list, bypass_cache=False)
        self.operation = operation
        self.dtype_A = torch_type(operation.A.element)
        self.dtype_B = torch_type(operation.B.element)
        self.dtype_C = torch_type(operation.C.element)
        # D uses the element type of C
        self.dtype_D = torch_type(operation.C.element)
        # NOTE(review): accumulator_size is computed but not used below
        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
        element_size = DataTypeSize[operation.A.element]
        # Range of the random operand values, chosen from the size of A's element
        # type (presumably in bits -- TODO confirm): wider types get a wider range
        if element_size == 1:
            self.rand_max = 1
            self.rand_min = 0
        elif element_size <= 8:
            self.rand_max = 1
            self.rand_min = -1
        elif element_size == 16:
            self.rand_max = 4
            self.rand_min = -4
        else:
            self.rand_max = 8
            self.rand_min = -8
        self.seed = seed
        self.compute_type = operation.epilogue_functor.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
    def print_problem_size(self, p, mode, batch_count):
        """
        Print the problem size, batch count and mode of a run.

        :param p: problem size exposing ``m``, ``n`` and ``k``
        :param mode: GEMM mode; the three modes handled below are printed by name,
            any other value is printed as is
        :param batch_count: batch count to print
        """
        if mode == GemmUniversalMode.Gemm:
            mode = "Gemm"
        elif mode == GemmUniversalMode.Batched:
            mode = "GemmBatched"
        elif mode == GemmUniversalMode.GemmSplitKParallel:
            mode = "GemmSplitKParallel"
        print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")
    def uniform_init(self, shape, dtype, layout):
        """
        Create a random tensor and its reference counterpart.

        :param shape: logical shape of the tensor
        :param dtype: torch dtype of the tensor
        :param layout: layout of the tensor handed to the kernel

        :return: tuple ``(data_cutlass, data_ref)``: ``data_cutlass`` is on the GPU
            and, unless the layout is row-major, has its last two dimensions
            transposed and made contiguous; ``data_ref`` has the logical shape
        """
        size = prod(shape)
        if dtype.is_floating_point:
            # ceil of a uniform sample yields integer-valued floating-point entries
            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
        else:
            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
            # Fall back to CPU for integer type references.
            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)
        # float32/float64 data is moved to the CPU before the reference view is taken
        if dtype == torch.float64 or dtype == torch.float32:
            data = data.to("cpu")
        data_ref = data.reshape(shape)
        if layout == LayoutType.RowMajor:
            data_cutlass = data_ref
        else:
            data_cutlass = data_ref.transpose(-1, -2).contiguous()
        data_cutlass = data_cutlass.to("cuda")
        return data_cutlass, data_ref
    def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        """
        Compute ``alpha * (A @ B) + beta * C`` with torch.

        :param problem_size: not used by this method
        :param tensor_A: reference tensor for operand A
        :param tensor_B: reference tensor for operand B
        :param tensor_C: reference tensor for operand C
        :param alpha: scale applied to the product, converted to the compute type
        :param beta: scale applied to C, converted to the compute type

        :return: the result converted to the dtype of D
        """
        # If any tensor is on CPU, place all tensors on CPU unless only
        # tensor C is on CPU
        devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
        if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
            device = torch.device("cpu")
        else:
            device = tensor_A.device
        tensor_A = tensor_A.to(device)
        tensor_B = tensor_B.to(device)
        tensor_C = tensor_C.to(device)
        dtype = torch_type(self.compute_type)
        alpha_torch = torch.tensor([alpha], device=device).to(dtype)
        beta_torch = torch.tensor([beta], device=device).to(dtype)
        tmp = tensor_A @ tensor_B
        tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
        return tensor_D_ref.to(self.dtype_D)
    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        """
        Launch the kernel for one problem and optionally verify its output.

        :param mode: GEMM mode to run in
        :param problem_size: problem size exposing ``m``, ``n`` and ``k``
        :param batch_count: batch count passed to the kernel arguments
        :param split_k_slices: number of split-K slices
        :param alpha: scale applied to the product
        :param beta: scale applied to C

        :return: ``True`` when verification is disabled or the output equals the
            reference exactly, ``False`` otherwise
        :raises AssertionError: if allocated memory remains after the arguments are deleted
        """
        torch.random.manual_seed(self.seed)
        # Assign an actual batch count in cases where we are not running in batched mode.
        # This is to differentiate between the number of split K slices and the batch count,
        # which are overloaded within the single `batch_count` variable.
        if mode == GemmUniversalMode.Batched:
            true_batch_count = batch_count
        else:
            true_batch_count = 1
        def transpose(layout):
            # Swap row-major and column-major
            if layout == LayoutType.RowMajor:
                return LayoutType.ColumnMajor
            else:
                return LayoutType.RowMajor
        # When the operation is "switched", each operand is generated with the
        # transposed layout of its counterpart (A uses B's, B uses A's, C its own)
        tensor_A, tensor_A_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.k),
            self.dtype_A,
            self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
        )
        tensor_B, tensor_B_ref = self.uniform_init(
            (true_batch_count, problem_size.k, problem_size.n),
            self.dtype_B,
            self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
        )
        tensor_C, tensor_C_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.n),
            self.dtype_C,
            self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
        )
        tensor_D = torch.zeros_like(tensor_C)
        # Integer compute types take integer scalars
        if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
            alpha = int(alpha)
            beta = int(beta)
        #
        # Launch kernel
        #
        arguments = GemmArguments(
            operation=self.operation,
            problem_size=problem_size,
            A=tensor_A,
            B=tensor_B,
            C=tensor_C,
            D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            gemm_mode=mode,
            split_k_slices=split_k_slices,
            batch=batch_count,
        )
        if mode == GemmUniversalMode.GemmSplitKParallel:
            # The reduction reads the GEMM's output buffer as its workspace and
            # writes the final result to D
            reduction_arguments = ReductionArguments(
                self.reduction_operation,
                problem_size=[problem_size.m, problem_size.n],
                partitions=split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta),
            )
        self.operation.run(arguments)
        if mode == GemmUniversalMode.GemmSplitKParallel:
            self.reduction_operation.run(reduction_arguments)
        passed = True
        if self.verification:
            # Synchronize on whichever operation ran last
            if mode == GemmUniversalMode.GemmSplitKParallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.reference(
                problem_size,
                tensor_A_ref,
                tensor_B_ref,
                tensor_C_ref,
                alpha,
                beta,
            )
            tensor_D_ref = tensor_D_ref.to('cuda')
            # Bring D back to the logical (row-major) shape before comparing
            if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
                tensor_D = tensor_D.transpose(-1, -2).contiguous()
            passed = tensor_D.equal(tensor_D_ref)
            # On mismatch, report the failing problem instead of raising; the
            # result is returned to the caller
            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, mode, batch_count)
        # Drop the argument objects before checking that all memory was released
        del arguments
        if mode == GemmUniversalMode.GemmSplitKParallel:
            del reduction_arguments
        cur_size = get_allocated_size()
        assert cur_size == 0, f"{cur_size} B of memory were not released after this run"
        return passed
def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
    """
    Run ``operation`` over a sweep of problem sizes, modes and batch counts.

    :param operation: GEMM operation to test
    :param testcase: "multistage" selects a small single-mode sweep; any other
        value selects the universal sweep. "interleaved" is not supported.
    :param compilation_mode: compiler used by the launcher ("nvcc" or "nvrtc")

    :return: ``False`` as soon as one run fails, ``True`` otherwise
    """
    passed = True
    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
    )
    opcode_class = operation.tile_description.math_instruction.opcode_class
    if opcode_class == OpcodeClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size

    alignment_m = alignment
    alignment_n = alignment
    alignment_k = alignment

    # INT8 alignment constraints
    if opcode_class == OpcodeClass.Simt:
        A_is_s8 = operation.A.element == DataType.s8
        B_is_s8 = operation.B.element == DataType.s8
        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
            alignment_m = 4
        # B_is_s8 is already a bool: comparing it with DataType.s8 was always
        # False. The N extent belongs to operand B, so B's layout is checked.
        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
            alignment_n = 4
        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
            alignment_k = 4

    threadblock_k = operation.tile_description.threadblock_shape[2]
    assert testcase != "interleaved"

    # Split-K parallel mode is exercised only pre-SM90 and without stream-K
    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK

    if testcase == "multistage":
        modes = [GemmUniversalMode.Gemm]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [
            threadblock_k,
            threadblock_k * operation.tile_description.stages
            + operation.tile_description.math_instruction.instruction_shape[2],
        ]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1]
    else:
        modes = [GemmUniversalMode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if supports_split_k:
            modes.append(GemmUniversalMode.GemmSplitKParallel)
        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        # Fall back to a fixed stage count when the tile description has none
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k,
        ]
        problem_alpha = [1.0]
        problem_beta = [2.0]

    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)

    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if k // batch_count < 2 * threadblock_k:
                                        continue

                                problem_size = GemmCoord(m, n, k)

                                # The batch count doubles as the number of split-K slices
                                if supports_split_k:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1

                                # A plain GEMM with several batches runs in batched mode
                                overridden_mode = mode
                                if mode == GemmUniversalMode.Gemm and batch_count > 1:
                                    overridden_mode = GemmUniversalMode.Batched

                                passed = testbed.run(
                                    overridden_mode,
                                    problem_size,
                                    batch_count,
                                    split_k_slices,
                                    alpha,
                                    beta,
                                )
                                if not passed:
                                    return False
    return passed

View File

@ -30,12 +30,14 @@
#
#################################################################################################
import pathlib
import unittest
if __name__ == '__main__':
loader = unittest.TestLoader()
tests = loader.discover('./', 'conv2d_*.py')
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
tests = loader.discover(script_dir, 'gemm_*.py')
testRunner = unittest.runner.TextTestRunner()
results = testRunner.run(tests)
if not results.wasSuccessful():

View File

@ -0,0 +1,239 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import cutlass
from cutlass import (
DataTypeNames,
EpilogueScheduleSuffixes,
KernelScheduleSuffixes,
LayoutType,
OpcodeClassNames,
ShortDataTypeNames,
ShortLayoutTypeNames
)
from cutlass.backend import library
from cutlass.backend.utils.software import SubstituteTemplate
from gemm_testbed import test_all_gemm
class Layout:
    """
    Short aliases relating transpose terminology to layouts:
    ``N`` (non-transposed) is column-major and ``T`` (transposed) is row-major
    """
    N = LayoutType.ColumnMajor
    T = LayoutType.RowMajor
class LayoutCombination:
    """
    All eight combinations of row- and column-major layouts for the A, B and C
    operands of a GEMM, named by their ``Layout`` letters in that order
    """
    # Row-major (T) operand A
    TTT = (Layout.T, Layout.T, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    # Column-major (N) operand A
    NTT = (Layout.N, Layout.T, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NNN = (Layout.N, Layout.N, Layout.N)
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Build the procedural name of a GEMM test case.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue (not part of the name)
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel; ``None`` is rendered as "auto"
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param kernel_schedule: kernel_schedule type; ``None`` adds nothing to the name
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type; ``None`` adds nothing to the name
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: str
    """
    # Optional parts of the name collapse to an empty string (or "auto" for stages)
    stages_tag = "auto" if stages is None else str(stages)
    kernel_tag = KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else ""
    epilogue_tag = EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else ""
    suffix_tag = suffix if suffix is not None else ""

    substitutions = dict(
        arch=str(arch),
        eA=DataTypeNames[element_a],
        eB=DataTypeNames[element_b],
        eC=DataTypeNames[element_output],
        lA=ShortLayoutTypeNames[layouts[0]],
        lB=ShortLayoutTypeNames[layouts[1]],
        lC=ShortLayoutTypeNames[layouts[2]],
        opclass=OpcodeClassNames[opclass],
        acc=DataTypeNames[element_accumulator],
        cM=str(cluster_shape[0]),
        cN=str(cluster_shape[1]),
        cK=str(cluster_shape[2]),
        tbM=str(threadblock_shape[0]),
        tbN=str(threadblock_shape[1]),
        tbK=str(threadblock_shape[2]),
        stages=stages_tag,
        aA=str(alignments[0]),
        aB=str(alignments[1]),
        aC=str(alignments[2]),
        k=kernel_tag,
        e=epilogue_tag,
        suffix=suffix_tag,
    )

    name_format = "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"
    return SubstituteTemplate(name_format, substitutions)
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=['nvcc', 'nvrtc']):
    """
    Create test-running functions with the given specification and set them as methods of ``cls``.

    One method is generated per entry of ``compilation_modes``; its name is
    produced by ``get_name`` with the compilation mode as suffix.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param compilation_modes: list of compilers to use in testing the kernel (options: 'nvrtc', 'nvcc');
        the default list is only iterated, never modified
    :type compilation_modes: list
    """
    # The epilogue computes in the accumulator type
    element_epilogue = element_accumulator

    for compilation_mode in compilation_modes:
        # The mode is bound as a default argument so that each generated test
        # keeps the mode of its own loop iteration. A plain closure would look
        # the variable up when the test runs, i.e. after the loop has finished,
        # and every test would use the last mode.
        def run(self, compilation_mode=compilation_mode):
            """
            Dynamically-generated function that constructs a GEMM operation and verifies it against
            multiple test cases.
            """
            element_A = element
            element_B = element
            layout_A, layout_B, layout_C = layouts
            alignment_A, alignment_B, alignment_C = alignments

            plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                                   element_C=element_output, element_D=element_output,
                                   layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                   element_accumulator=element_accumulator,
                                   kernel_cc=cc)
            plan.opclass = opclass
            if swizzle is not None:
                plan.swizzling_functor = swizzle

            # Start from the first available tile description and override its fields
            td = plan.tile_descriptions()[0]
            if warp_count is not None:
                td.warp_count = warp_count
            td.threadblock_shape = threadblock_shape
            td.stages = stages
            td.cluster_shape = cluster_shape

            op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))

        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')
        setattr(cls, name, run)

View File

@ -38,7 +38,6 @@ from math import ceil
import unittest
import cutlass
import cutlass_bindings
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException

View File

@ -0,0 +1,245 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Test the EVT interface
"""
import numpy as np
import unittest
import cutlass
from cutlass import LayoutType, Tensor
from cutlass.backend.utils.device import device_cc
from cutlass.epilogue import reshape, permute
from utils import ExpectException
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class EVTErrorTests(unittest.TestCase):
    """
    Tests various error scenarios that arise with the EVT interface

    Each test passes a small epilogue function and a dictionary of example tensors to
    ``cutlass.epilogue.trace`` (or assigns the traced visitor to a GEMM plan) inside an
    ``ExpectException`` block. The trailing ``True`` given to ``ExpectException`` is its
    ``verify_msg`` flag, so the expected string is compared against
    ``"<ExceptionType>: <message>"`` of the exception that was raised.
    """

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
    def test_root_not_d(self):
        """
        Test when "D" does not exist in Sm90 EVT
        """
        # The epilogue returns ``F`` only; no variable named ``D`` is returned.
        def evt_root_not_d(accum, alpha):
            F = accum * alpha
            return F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        # The exception is only expected on compute capability 90.
        with ExpectException(device_cc() == 90,
            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
            "but the variable 'D' is not found in the return values.", True):

            cutlass.epilogue.trace(evt_root_not_d, example_tensors)

    def test_no_accum(self):
        """
        Test when "accum" is not in input arguments
        """
        def evt_no_accum(alpha, C):
            D = alpha * C
            return D

        example_tensors = {
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.2,
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
            cutlass.epilogue.trace(evt_no_accum, example_tensors)

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
    def test_too_much_shared_memory(self):
        """
        Test when the epilogue consumes too much shared memory
        """
        # Five auxiliary inputs and five returned tensors.
        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
            D = D4 + C5
            return D, D1, D2, D3, D4

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C1": self.fake_tensor(np.float16, (6, 512, 512)),
            "C2": self.fake_tensor(np.float16, (6, 512, 512)),
            "C3": self.fake_tensor(np.float16, (6, 512, 512)),
            "C4": self.fake_tensor(np.float16, (6, 512, 512)),
            "C5": self.fake_tensor(np.float16, (6, 512, 512)),
            "D1": self.fake_tensor(np.float16, (6, 512, 512)),
            "D2": self.fake_tensor(np.float16, (6, 512, 512)),
            "D3": self.fake_tensor(np.float16, (6, 512, 512)),
            "D4": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        # Tracing happens outside the ExpectException block; the error is expected only
        # when the visitor is assigned to the plan.
        epilogue_visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, example_tensors)

        plan = cutlass.op.Gemm(
            element=np.float16, layout=cutlass.LayoutType.RowMajor,
            element_accumulator=np.float32
        )

        with ExpectException(True,
            "RuntimeError: The epilogue consumes too much shared memory. "
            "No valid tile description is found in the generator.", True):

            plan.epilogue_visitor = epilogue_visitor

    def test_not_ssa(self):
        """
        Test when the epilogue is not in SSA

        Covers two cases: a variable that is assigned twice, and a variable that is used
        without being defined.
        """
        # Case 1: ``F`` is assigned twice.
        def evt_redefine(accum, C, alpha):
            F = accum + C
            F = F * alpha
            D = F
            return D, F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
            cutlass.epilogue.trace(evt_redefine, example_tensors)

        # Case 2: ``C`` is used in the body but is not an argument of the epilogue.
        def evt_undefine(accum, alpha):
            F = accum + C
            D = F * alpha
            return D, F

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
            "F": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
            cutlass.epilogue.trace(evt_undefine, example_tensors)

    def test_missing_example_tensor(self):
        """
        Test when the example tensor of an input/output variable is not provided
        """
        def evt_missing_example_tensor(accum, C):
            D = accum + C
            return D

        # Output ``D`` has no example tensor.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

        # Input ``C`` has no example tensor.
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, example_tensors)

    def test_return_expression(self):
        """
        Test when the return value is an expression
        """
        def evt_return_expr(accum, C):
            return accum + C

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
        }

        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
            cutlass.epilogue.trace(evt_return_expr, example_tensors)

    def test_incompatible_shape(self):
        """
        Test when the shape of example tensors are incompatible
        """
        def evt_incompatible_shape(accum, C):
            D = accum + C
            return D

        # ``accum`` is (6, 256, 512) while ``C`` and ``D`` are (6, 512, 512).
        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True,
            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):

            cutlass.epilogue.trace(evt_incompatible_shape, example_tensors)

    def test_no_matching_impl(self):
        """
        Test when tracing is expected to fail with ``NotImplementedError`` because no op
        matches the permuted and reshaped ``bias`` input (see the expected message below)
        """
        def evt_no_matching_impl(accum, bias):
            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
            return D

        example_tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
            "bias": self.fake_tensor(np.float16, (16, 32)),
            "D": self.fake_tensor(np.float16, (6, 512, 256))
        }

        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
            cutlass.epilogue.trace(evt_no_matching_impl, example_tensors)

    #
    # Helper functions
    #

    def fake_tensor(self, element, shape):
        """
        Build a row-major ``Tensor`` descriptor with the given element type and shape,
        used as an example tensor for tracing
        """
        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -38,7 +38,6 @@ from math import ceil
import unittest
import cutlass
import cutlass_bindings
import cutlass.utils.datatypes as datatypes
from cutlass.backend.utils.device import device_cc
from utils import ExpectException
@ -262,13 +261,13 @@ class GemmErrorTests(unittest.TestCase):
# Ensure that all tile descriptions have opclass of TensorOp
for td in plan.tile_descriptions():
assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.TensorOp
assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp
plan.opclass = cutlass.OpcodeClass.Simt
# Ensure that all tile descriptions have opclass of Simt
for td in plan.tile_descriptions():
assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.Simt
assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt
def test_invalid_tile_description(self):
"""

View File

@ -50,9 +50,10 @@ class ExpectException:
:param message: message to print if an exception is raised when not expected or vice versa
:type message: str
"""
def __init__(self, exception_expected: bool, message: str = ''):
def __init__(self, exception_expected: bool, message: str = '', verify_msg: bool = False):
    """
    Store the expectation that is checked when the ``with`` block exits.

    :param exception_expected: whether the body of the ``with`` block is expected to raise
    :param message: assertion message; when ``verify_msg`` is set it is also the exact
        ``"<ExceptionType>: <message>"`` text compared against the raised exception
    :param verify_msg: whether ``__exit__`` should also compare the raised exception against ``message``
    """
    self.exception_expected = exception_expected
    self.message = message
    self.verify_msg = verify_msg
def __enter__(self):
    """Enter the context; returns this object so it can be bound with ``as``."""
    return self
@ -60,6 +61,9 @@ class ExpectException:
def __exit__(self, exc_type, exc_val, traceback):
exception_raised = exc_type is not None
assert self.exception_expected == exception_raised, self.message
if self.verify_msg:
exc_message = f"{exc_type.__name__}: {exc_val}"
assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"
# Suppress the exception
return True

View File

@ -0,0 +1,75 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility script for discovering and running all PyCuTe tests
"""
import argparse
import logging
import pathlib
import unittest
def numeric_log_level(log_level: str) -> int:
    """
    Translate a log-level name into the integer value that the ``logging`` module uses.

    The name is upper-cased and looked up as an attribute of the ``logging`` module.

    :param log_level: string representation of log level (e.g., 'INFO', 'DEBUG')
    :type log_level: str

    :return: numeric representation of log level
    :rtype: int

    :raises ValueError: if the upper-cased name is not an integer attribute of ``logging``
    """
    level = getattr(logging, log_level.upper(), None)
    if isinstance(level, int):
        return level
    raise ValueError(f"Invalid log level: {log_level}")
if __name__ == "__main__":
    # Command-line interface: a single optional --log-level flag, converted to its
    # numeric value by numeric_log_level.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
                            help='Logging level to be used by the generator script')
    cli_args = arg_parser.parse_args()

    # Apply the requested logging level before any test module is loaded
    logging.basicConfig(level=cli_args.log_level)

    # Discover every test_*.py file that lives next to this script and run the lot
    test_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(test_dir, "test_*.py")
    outcome = unittest.runner.TextTestRunner().run(suite)

    # Surface any failure to the caller
    if not outcome.wasSuccessful():
        raise Exception("Test cases failed")

View File

@ -0,0 +1,95 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.coalesce
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestCoalesce(unittest.TestCase):
    """Checks that ``coalesce`` yields a layout equivalent to its input."""

    def helper_test_coalesce(self, layout):
        """
        Coalesce ``layout`` and verify that the result has the same size and maps every
        linear index to the same value as the original layout.
        """
        layoutR = coalesce(layout)

        _LOGGER.debug(f"{layout} => {layoutR}")

        self.assertEqual(size(layoutR), size(layout))

        for idx in range(size(layout)):
            self.assertEqual(layoutR(idx), layout(idx))

    def test_coalesce(self):
        """Run the coalesce check over a set of layouts given as ``Layout`` constructor arguments."""
        layout_args = (
            (1, 0),
            (1, 1),
            ((2, 4),),
            ((2, 4, 6),),
            ((2, 4, 6), (1, 6, 2)),
            ((2, 1, 6), (1, 7, 2)),
            ((2, 1, 6), (4, 7, 8)),
            ((2, (4, 6)),),
            ((2, 4), (4, 1)),
            ((2, 4, 6), (24, 6, 1)),
            ((2, 1, 3), (2, 4, 4)),
            (((2, 2), (2, 2)), ((1, 4), (8, 32))),
        )

        # Each layout is built immediately before it is checked
        for args in layout_args:
            self.helper_test_coalesce(Layout(*args))
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,92 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.complement
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestComplement(unittest.TestCase):
    """Checks the post-condition of ``complement`` on a set of layouts."""

    def helper_test_complement(self, layout):
        """
        Compute the complement of ``layout`` and verify that the two codomains are disjoint,
        except that both layouts may map to 0.

        Uses ``self.assertTrue`` rather than a bare ``assert`` so the check is still performed
        under ``python -O`` and a failure reports the offending layouts.
        """
        layoutR = complement(layout)

        _LOGGER.debug(f"{layout} => {layoutR}")

        # Post-condition: test disjointness of the codomains
        for a in range(size(layout)):
            for b in range(size(layoutR)):
                lhs = layout(a)
                rhs = layoutR(b)
                self.assertTrue(
                    (lhs != rhs) or (lhs == 0 and rhs == 0),
                    f"{layout} and its complement {layoutR} both map to {lhs}")

    def test_complement(self):
        """Run the complement check over a set of layouts given as ``Layout`` constructor arguments."""
        layout_args = (
            (1, 0),
            (1, 1),
            (4, 0),
            ((2, 4), (1, 2)),
            ((2, 3), (1, 2)),
            ((2, 4), (1, 4)),
            ((2, 4, 8), (8, 1, 64)),
            (((2, 2), (2, 2)), ((1, 4), (8, 32))),
            ((2, (3, 4)), (3, (1, 6))),
            ((4, 6), (1, 6)),
            ((4, 10), (1, 10)),
        )

        # Each layout is built immediately before it is checked
        for args in layout_args:
            self.helper_test_complement(Layout(*args))
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,204 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.composition
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestComposition(unittest.TestCase):
    """Checks that ``composition(A, B)`` behaves as the functional composition A o B."""

    def helper_test_composition(self, layoutA, layoutB):
        """
        Compose ``layoutA`` with ``layoutB`` and verify the result index by index.
        """
        layoutR = composition(layoutA, layoutB)

        _LOGGER.debug(f"{layoutA} o {layoutB} => {layoutR}")

        # True post-condition: Every coordinate c of layoutB with L1D(c) < size(layoutR) is a coordinate of layoutR.

        # Verify R(c) == A(B(c)) for each linear coordinate c of the result
        for idx in range(size(layoutR)):
            self.assertEqual(layoutR(idx), layoutA(layoutB(idx)))

    def test_composition(self):
        """
        Run the composition check over pairs of layouts. Each entry holds the ``Layout``
        constructor arguments for A followed by those for B.
        """
        nested_8x8 = (((2, 2, 2), (2, 2, 2)), ((1, 16, 4), (8, 2, 32)))

        layout_arg_pairs = (
            ((1, 0), (1, 0)),
            ((1, 0), (1, 1)),
            ((1, 1), (1, 0)),
            ((1, 1), (1, 1)),
            ((4,), (4,)),
            ((4, 2), (4,)),
            ((4,), (4, 2)),
            ((4, 0), (4,)),
            ((4,), (4, 0)),
            ((1, 0), (4,)),
            ((4,), (1, 0)),
            ((4,), (2,)),
            ((4, 2), (2,)),
            ((4,), (2, 2)),
            ((4, 2), (2, 2)),
            ((12,), ((4, 3),)),
            ((12, 2), ((4, 3),)),
            ((12,), ((4, 3), (3, 1))),
            ((12, 2), ((4, 3), (3, 1))),
            ((12,), ((2, 3), (2, 4))),
            (((4, 3),), ((4, 3),)),
            (((4, 3),), (12,)),
            (((4, 3),), (6, 2)),
            (((4, 3),), ((6, 2), (2, 1))),
            (((4, 3), (3, 1)), ((4, 3),)),
            (((4, 3), (3, 1)), (12,)),
            (((4, 3), (3, 1)), (6, 2)),
            (((4, 3), (3, 1)), ((6, 2), (2, 1))),
            (((8, 8),), nested_8x8),
            (((8, 8), (8, 1)), nested_8x8),
            (nested_8x8, (8, 4)),
            (((4, 2), (1, 16)), ((4, 2), (2, 1))),
            (((2, 2), (2, 1)), ((2, 2), (2, 1))),
            (((4, 8, 2),), ((2, 2, 2), (2, 8, 1))),
            (((4, 8, 2), (2, 8, 1)), ((2, 2, 2), (1, 8, 2))),
            (((4, 8, 2), (2, 8, 1)), ((4, 2, 2), (2, 8, 1))),
        )

        # A is built first, then B, then the pair is checked
        for args_a, args_b in layout_arg_pairs:
            layoutA = Layout(*args_a)
            layoutB = Layout(*args_b)
            self.helper_test_composition(layoutA, layoutB)
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,80 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.int_tuple
"""
import unittest
from pycute import *
class TestIntTuple(unittest.TestCase):
    """Table-driven checks of the integer-tuple helpers imported from ``pycute``."""

    def test_product(self):
        """``product`` of an int, of a flat tuple, and applied twice to a nested tuple."""
        for operand, expected in ((2, 2), ((3, 2), 6)):
            self.assertEqual(product(operand), expected)

        self.assertEqual(product(product(((2, 3), 4))), 24)

    def test_inner_product(self):
        """``inner_product`` of ints, flat tuples, and nested tuples."""
        cases = (
            (2, 3, 6),
            ((1, 2), (3, 2), 7),
            (((2, 3), 4), ((2, 1), 2), 15),
        )
        for lhs, rhs, expected in cases:
            self.assertEqual(inner_product(lhs, rhs), expected)

    def test_shape_div(self):
        """``shape_div`` of flat and nested shapes by an integer divisor."""
        cases = (
            ((3, 4), 6, (1, 2)),
            ((3, 4), 12, (1, 1)),
            ((3, 4), 36, (1, 1)),
            (((3, 4), 6), 36, ((1, 1), 2)),
            ((6, (3, 4)), 36, (1, (1, 2))),
        )
        for shape, divisor, expected in cases:
            self.assertEqual(shape_div(shape, divisor), expected)

    def test_prefix_product(self):
        """``prefix_product`` of an int, flat tuples, and nested tuples."""
        cases = (
            (2, 1),
            ((3, 2), (1, 3)),
            ((3, 2, 4), (1, 3, 6)),
            (((2, 3), 4), ((1, 2), 6)),
            (((2, 3), (2, 1, 2), (5, 2, 1)), ((1, 2), (6, 12, 12), (24, 120, 240))),
        )
        for shape, expected in cases:
            self.assertEqual(prefix_product(shape), expected)

View File

@ -0,0 +1,87 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.left_inverse
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestLeftInverse(unittest.TestCase):
    """Checks that ``left_inverse`` undoes a layout on every index of its domain."""

    def helper_test_left_inverse(self, layout):
        """
        Compute the left inverse of ``layout`` and verify that applying it to ``layout(i)``
        gives back ``i`` for every linear index ``i``.
        """
        inv_layout = left_inverse(layout)

        _LOGGER.debug(f"{layout} => {inv_layout}")

        for idx in range(size(layout)):
            self.assertEqual(inv_layout(layout(idx)), idx)

    def test_left_inverse(self):
        """Run the left-inverse check over a set of layouts given as ``Layout`` constructor arguments."""
        layout_args = (
            (1, 0),
            ((1, 1), (0, 0)),
            (1, 1),
            (4, 1),
            (4, 2),
            ((8, 4), (1, 8)),
            ((8, 4), (4, 1)),
            ((2, 4, 6), (1, 2, 8)),
            ((2, 4, 6), (4, 1, 8)),
            ((4, 2), (1, 16)),
        )

        # Each layout is built immediately before it is checked
        for args in layout_args:
            self.helper_test_left_inverse(Layout(*args))
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,96 @@
#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Unit tests for pycute.right_inverse
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestRightInverse(unittest.TestCase):
    """Exercises ``right_inverse`` over a fixed set of layouts."""

    def helper_test_right_inverse(self, layout):
        """Check that ``layout(right_inverse(layout)(i)) == i``.

        The check runs for every ``i`` in ``range(size(inv_layout))``, i.e.
        over the domain of the computed inverse rather than of ``layout``.
        """
        inv_layout = right_inverse(layout)
        _LOGGER.debug(f"{layout} => {inv_layout}")
        for index in range(size(inv_layout)):
            preimage = inv_layout(index)
            self.assertEqual(layout(preimage), index)

    def test_right_inverse(self):
        """Run the right-inverse check on each hard-coded layout, in order."""
        # Each entry is the pair of positional arguments handed to Layout(...).
        layout_args = (
            (1, 0),
            ((1, 1), (0, 0)),
            ((3, 7), (0, 0)),
            (1, 1),
            (4, 0),
            (4, 1),
            (4, 2),
            ((2, 4), (0, 2)),
            ((8, 4), (1, 8)),
            ((8, 4), (4, 1)),
            ((2, 4, 6), (1, 2, 8)),
            ((2, 4, 6), (4, 1, 8)),
            ((4, 2), (1, 16)),
        )
        for first, second in layout_args:
            self.helper_test_right_inverse(Layout(first, second))
# Run the tests through unittest's command-line entry point when this file
# is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -1,6 +1,6 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
@ -30,12 +30,30 @@
#
#################################################################################################
import cutlass.backend
"""
Unit tests for pycute.typing
"""
import logging
import unittest
from pycute import *
_LOGGER = logging.getLogger(__name__)
class TestTyping(unittest.TestCase):
    """Checks ``issubclass`` / ``isinstance`` results against ``Integer``."""

    def helper_test_typing(self, _cls, _obj, cls, expected: bool):
        """Assert that both type checks against ``cls`` yield ``expected``.

        Args:
            _cls: class passed as the first argument of ``issubclass``.
            _obj: object passed as the first argument of ``isinstance``.
            cls: the type both checks are made against.
            expected: the boolean both checks must return.
        """
        _LOGGER.debug(f"issubclass({_cls}, {cls})")
        _LOGGER.debug(f"isinstance({_obj}, {cls})")
        subclass_result = issubclass(_cls, cls)
        self.assertEqual(expected, subclass_result)
        instance_result = isinstance(_obj, cls)
        self.assertEqual(expected, instance_result)

    def test_typing(self):
        """Only ``int`` is expected to match ``Integer``; float, str and bool are not."""
        # (class, sample instance, expected result of both checks)
        for candidate_cls, candidate_obj, outcome in (
            (int, 1, True),
            (float, 1., False),
            (str, 'hi', False),
            (bool, False, False),
        ):
            self.helper_test_typing(candidate_cls, candidate_obj, Integer, outcome)
# Script entry point.
# NOTE(review): the surrounding text carries diff hunk headers
# ("@ -30,12 +30,30 @") and shows no +/- markers, so these lines look like the
# removed and added sides of one hunk rendered together. Presumably the
# cutlass.backend memory-pool setup and the 'gemm_*.py' discovery/run lines are
# the old body and `unittest.main()` is the new one — confirm against the
# actual file before relying on this block.
if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**30, 2**30)
    loader = unittest.TestLoader()
    tests = loader.discover('./', 'gemm_*.py')
    testRunner = unittest.runner.TextTestRunner()
    testRunner.run(tests)
    unittest.main()