CUTLASS 3.2.1 (#1113)
* Updates for 3.2.1 release. * Minor fix in gemm op profiler for raster order. * Add scheduler mapping for raster order in the kernels.
This commit is contained in:
@ -1,233 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
from cutlass.backend.conv2d_operation import *
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Dgrad implicit-GEMM conv2d tests: f16 NHWC operands/output, f16 tensor-op accumulation, SM80.

    The four tests below all build the same operation and differ only in the
    iterator algorithm (analytic vs. optimized), the operand alignment (8 vs. 4
    elements), and whether a reduced-channel problem size list is supplied, so
    the shared construction lives in ``_run_dgrad_test``.
    """

    def _run_dgrad_test(self, iterator_algorithm, alignment, problem_sizes=None):
        """Build a unity-stride f16 dgrad Conv2dOperation and run the conv2d test harness.

        Args:
            iterator_algorithm: cutlass_bindings.conv.IteratorAlgorithm variant to test.
            alignment: element alignment used for all of A, B and C.
            problem_sizes: optional explicit problem-size list; when None the
                harness uses its default sweep.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float16,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment)
        C = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(operation))
        else:
            self.assertTrue(test_all_conv2d(operation, problem_sizes))

    @staticmethod
    def _align4_problem_sizes():
        """Problem sizes with channel count 12 (not divisible by 8), exercising alignment 4."""
        return [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        self._run_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.analytic, alignment=8)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        self._run_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.optimized, alignment=8)

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        self._run_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.analytic, alignment=4,
                             problem_sizes=self._align4_problem_sizes())

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        self._run_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.optimized, alignment=4,
                             problem_sizes=self._align4_problem_sizes())
|
||||
|
||||
if __name__ == '__main__':
    # Reserve the CUTLASS backend device memory pool (64 MiB init / 64 MiB max)
    # before any test allocates device tensors.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,209 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad implicit-GEMM conv2d tests: f16 NHWC operands, f32 NHWC output, f32 accumulation, SM80.

    All four tests use the optimized iterator with unity stride support and
    differ only in the threadblock K extent (32 vs. 64) and pipeline stage
    count (3 vs. 4), so the shared construction lives in one helper.
    """

    def _run_unity_stride_dgrad_test(self, threadblock_k, stages):
        """Build an optimized-iterator f16->f32 dgrad operation and run the conv2d harness.

        Args:
            threadblock_k: K extent of the threadblock tile.
            stages: number of software pipeline stages.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)
        # Output is f32, so its vector alignment is 4 elements (128b) rather than 8.
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, threadblock_k], stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        self._run_unity_stride_dgrad_test(threadblock_k=32, stages=3)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        self._run_unity_stride_dgrad_test(threadblock_k=32, stages=4)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        self._run_unity_stride_dgrad_test(threadblock_k=64, stages=3)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        self._run_unity_stride_dgrad_test(threadblock_k=64, stages=4)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve the CUTLASS backend device memory pool (64 MiB init / 64 MiB max)
    # before any test allocates device tensors.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,130 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend.conv2d_operation import *
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Dgrad implicit-GEMM conv2d tests: f32 NHWC throughout, SIMT math, SM80.

    Both tests share the operation construction and differ only in the
    iterator algorithm and the warp rasterization ([4,2,1] vs. [2,4,1]).
    """

    def _run_simt_dgrad_test(self, iterator_algorithm, warp_count):
        """Build a SIMT f32 dgrad Conv2dOperation and run the conv2d harness.

        Args:
            iterator_algorithm: cutlass_bindings.conv.IteratorAlgorithm variant to test.
            warp_count: [M, N, K] warp arrangement within the threadblock.
        """
        # SIMT math: one multiply-add per "instruction".
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=warp_count,
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    # NOTE(review): the name says "Fprop" but the operation under test is dgrad
    # (conv_kind=Operator.dgrad) — likely a copy-paste slip in the original;
    # name kept unchanged to preserve test discovery/selection.
    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_simt_dgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, warp_count=[4, 2, 1])

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_simt_dgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, warp_count=[2, 4, 1])
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Reserve the CUTLASS backend device memory pool (64 MiB init / 64 MiB max)
    # before any test allocates device tensors.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,127 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Dgrad implicit-GEMM conv2d tests: tf32 NHWC operands, f32 output, tensor-op, SM80.

    The two tests are identical apart from the iterator algorithm, so the
    shared operation construction lives in one helper.
    """

    def _run_tf32_dgrad_test(self, iterator_algorithm):
        """Build a tf32 tensor-op dgrad Conv2dOperation and run the conv2d harness.

        Args:
            iterator_algorithm: cutlass_bindings.conv.IteratorAlgorithm variant to test.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        self._run_tf32_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        self._run_tf32_dgrad_test(cutlass_bindings.conv.IteratorAlgorithm.optimized)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve the CUTLASS backend device memory pool (64 MiB init / 64 MiB max)
    # before any test allocates device tensors.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,196 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
# NOTE(review): skipIf on a plain helper is unconventional — when device_cc() < 80
# the wrapped callable raises SkipTest at call time inside a test method. Kept
# for backward compatibility; the TestCase class should also carry the skip.
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
def conv2d_few_channel_problemsizes(channels):
    """Return conv2d fprop problem sizes exercising the few-channels iterator.

    Every problem shares padding (1, 1, 1, 1), dilation (1, 1),
    cross-correlation mode, and split_k_slices = groups = 1.

    :param channels: channel count C used for both activation and filter
    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize``
    """
    # (N, H, W), (K, R, S), (stride_h, stride_w) for each problem.
    shapes = [
        ((1, 8, 8),     (16, 3, 3), (2, 2)),
        ((1, 16, 16),   (16, 3, 3), (2, 2)),
        ((1, 16, 16),   (16, 7, 7), (1, 1)),
        ((1, 224, 224), (32, 7, 7), (1, 1)),
        ((1, 224, 224), (64, 7, 7), (2, 2)),
        ((1, 224, 224), (64, 5, 5), (1, 1)),
        ((1, 224, 224), (64, 5, 5), (2, 2)),
    ]
    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(n, h, w, channels),
            cutlass_bindings.Tensor4DCoord(k, r, s, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride_h, stride_w),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for (n, h, w), (k, r, s), (stride_h, stride_w) in shapes
    ]
|
||||
|
||||
# Skip at class level so no SM80 operation is even constructed on older devices
# (previously the skip only fired when the problem-size helper was called,
# after the Conv2dOperation had already been built). Matches the sibling
# implicit-GEMM test classes in this suite.
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """F16 NHWC fprop tests for the few-channels iterator algorithm on SM80.

    NOTE(review): 'F16HNWC' in the class name looks like a typo for 'F16NHWC';
    kept unchanged because the class name is part of the public test interface.
    """

    def _run_few_channels_test(self, channels, instruction_shape,
                               threadblock_shape, stages):
        """Build a few-channels fprop operation and run the conv2d test suite.

        A/B operand alignment equals ``channels``; C is always alignment 8.
        """
        math_inst = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels)
        C = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape, stages=stages,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(channels)))

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        # channels == 2: 16x8x16 MMA with a 3-stage, K=64 threadblock tile.
        self._run_few_channels_test(2, [16, 8, 16], [128, 128, 64], 3)

    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
        # channels == 1: narrower 16x8x8 MMA, shallower 2-stage, K=32 tile.
        self._run_few_channels_test(1, [16, 8, 8], [128, 128, 32], 2)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB for each of the backend's memory pools before running.
    pool_bytes = 1 << 26
    cutlass.backend.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -1,220 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
# NOTE(review): skipIf on a plain helper is unconventional — when device_cc() < 80
# the wrapped callable raises SkipTest at call time inside a test method. Kept
# for backward compatibility; the TestCase class should also carry the skip.
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
def conv2d_fixed_channel_problemsizes(channels):
    """Return conv2d fprop problem sizes exercising the fixed-channels iterator.

    Every problem shares padding (1, 1, 1, 1), dilation (1, 1),
    cross-correlation mode, and split_k_slices = groups = 1.

    :param channels: channel count C used for both activation and filter
    :return: list of ``cutlass_bindings.conv.Conv2dProblemSize``
    """
    # (N, H, W), (K, R, S), (stride_h, stride_w) for each problem.
    shapes = [
        ((1, 8, 8),     (16, 3, 3), (2, 2)),
        ((1, 224, 224), (32, 7, 7), (1, 1)),
        ((1, 224, 224), (64, 7, 7), (2, 2)),
        ((1, 224, 224), (64, 5, 5), (1, 1)),
        ((1, 224, 224), (64, 5, 5), (2, 2)),
    ]
    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(n, h, w, channels),
            cutlass_bindings.Tensor4DCoord(k, r, s, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride_h, stride_w),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for (n, h, w), (k, r, s), (stride_h, stride_w) in shapes
    ]
|
||||
|
||||
# Skip at class level so no SM80 operation is even constructed on older devices
# (previously the skip only fired when the problem-size helper was called,
# after the Conv2dOperation had already been built). Matches the sibling
# implicit-GEMM test classes in this suite.
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16HNWCTensorOpF32SM80(unittest.TestCase):
    """F16 NHWC fprop tests for the fixed-channels iterator algorithm on SM80.

    NOTE(review): 'F16HNWC' in the class name looks like a typo for 'F16NHWC';
    kept unchanged because the class name is part of the public test interface.
    """

    def _run_fixed_channels_test(self, channels):
        """Build a fixed-channels fprop operation and run the conv2d test suite.

        A/B operand alignment equals ``channels``; C is always alignment 8.
        All variants share the 16x8x16 MMA and a 3-stage 128x128x64 tile.
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=channels)
        C = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.fixed_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(channels)))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
        self._run_fixed_channels_test(8)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
        self._run_fixed_channels_test(4)

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        self._run_fixed_channels_test(2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB for each of the backend's memory pools before running.
    pool_bytes = 1 << 26
    cutlass.backend.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -1,341 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """F16 NHWC fprop implicit-GEMM tests with F16 accumulation on SM80.

    The five tests vary only the A/B alignment, the iterator algorithm, and
    (for the misaligned-channel variants) the problem sizes, so the operation
    construction is factored into private helpers.
    """

    def _make_operation(self, alignment, iterator_algorithm):
        """Build an SM80 fprop Conv2dOperation for the given operand alignment.

        :param alignment: vector access width for the A and B operands
                          (C output is always alignment 8)
        :param iterator_algorithm: a ``cutlass_bindings.conv.IteratorAlgorithm``
        :return: configured ``Conv2dOperation``
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float16,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment)
        C = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float16)

        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

    def _misaligned_problem_sizes(self, c1, c2, c3):
        """Problem sizes whose channel counts are not multiples of 8.

        :param c1: channels for the first 4x4 input
        :param c2: channels for the second 4x4 input
        :param c3: channels for the large 23x56 input
        """
        return [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, c1),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, c1),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, c2),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, c2),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 23, 56, c3),
                cutlass_bindings.Tensor4DCoord(128, 3, 3, c3),
                cutlass_bindings.Tensor4DCoord(4, 0, 5, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._make_operation(
            8, cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._make_operation(
            8, cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        operation = self._make_operation(
            2, cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(
            operation, self._misaligned_problem_sizes(12, 14, 98)))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        operation = self._make_operation(
            2, cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(
            operation, self._misaligned_problem_sizes(12, 14, 98)))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        # NOTE(review): the test name says 'Analytic' but the original code
        # uses the optimized iterator; behavior preserved — confirm intent.
        operation = self._make_operation(
            4, cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(
            operation, self._misaligned_problem_sizes(12, 28, 100)))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Reserve 64 MiB for each of the backend's memory pools before running.
    pool_bytes = 1 << 26
    cutlass.backend.get_memory_pool(pool_bytes, pool_bytes)
    unittest.main()
|
||||
@ -1,86 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Conv2d fprop: f16 NHWC operands, f32 NHWC output, tensor-op f32 accumulation (SM80)."""

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        # 16x8x16 HMMA tensor-op instruction accumulating in f32.
        instruction = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )

        # Operand descriptions: f16 NHWC activations/filters, f32 NHWC output.
        tensor_a = TensorDescription(
            element=instruction.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tensor_b = TensorDescription(
            element=instruction.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tensor_c = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4,
        )

        # 128x128x64 threadblock tile, 3 pipeline stages, 2x2x1 warps.
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=instruction,
        )

        # Standard alpha*AB + beta*C epilogue in f32.
        epilogue = LinearCombination(
            tensor_c.element, tensor_c.alignment,
            instruction.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.analytic,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

        # Runs the CUTLASS conv2d verification harness over its default problem set.
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,128 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend.conv2d_operation import *
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Conv2d fprop: f32 NHWC operands and output, SIMT math (SM80)."""

    @staticmethod
    def _build_operation(iterator_algorithm, warp_count, swizzling_functor):
        """Assemble the shared f32 SIMT fprop operation; only the iterator
        algorithm, warp arrangement, and swizzle differ between tests."""
        instruction = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass_bindings.float32,
            element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add,
        )
        tensor_a = TensorDescription(
            element=instruction.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4,
        )
        tensor_b = TensorDescription(
            element=instruction.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4,
        )
        tensor_c = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=1,
        )
        tile = TileDescription(
            threadblock_shape=[128, 128, 8],
            stages=4,
            warp_count=warp_count,
            math_instruction=instruction,
        )
        epilogue = LinearCombination(
            tensor_c.element, tensor_c.alignment,
            instruction.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=swizzling_functor,
        )

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            warp_count=[4, 2, 1],
            swizzling_functor=cutlass_bindings.IdentitySwizzle2,
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            warp_count=[2, 4, 1],
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,139 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Conv2d fprop: tf32 tensor-op math over f32 NHWC tensors (SM80)."""

    @staticmethod
    def _build_operation(iterator_algorithm, alignment_ab):
        """Assemble the shared tf32 fprop operation; only the iterator
        algorithm and A/B alignment differ between tests."""
        instruction = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass_bindings.float32,
            element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tensor_a = TensorDescription(
            element=instruction.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab,
        )
        tensor_b = TensorDescription(
            element=instruction.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab,
        )
        tensor_c = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tile = TileDescription(
            threadblock_shape=[128, 128, 16],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=instruction,
        )
        epilogue = LinearCombination(
            tensor_c.element, tensor_c.alignment,
            instruction.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.fprop,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic, alignment_ab=4)
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align2(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized, alignment_ab=2)

        # Single problem whose channel count exercises the align-2 path.
        problem_sizes = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            )
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))


if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,285 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dStridedDgradImplicitGemmF16NHWCF16NHWCF32NHWCTensorOpF32SM80(unittest.TestCase):
    """Strided-dgrad conv2d: f16 NHWC operands, f32 NHWC output, tensor-op f32 (SM80)."""

    @staticmethod
    def _build_operation(iterator_algorithm, alignment_ab, threadblock_shape, warp_count):
        """Assemble a strided-dgrad operation with the shared f16->f32
        configuration; alignment, tile, warps, and iterator vary per test."""
        instruction = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tensor_a = TensorDescription(
            element=instruction.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab,
        )
        tensor_b = TensorDescription(
            element=instruction.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab,
        )
        tensor_c = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4,
        )
        tile = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3,
            warp_count=warp_count,
            math_instruction=instruction,
        )
        epilogue = LinearCombination(
            tensor_c.element, tensor_c.alignment,
            instruction.element_accumulator, cutlass_bindings.float32)
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.dgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            # Strided dgrad requires the dedicated dgrad swizzle.
            swizzling_functor=cutlass_bindings.StridedDgradIdentitySwizzle1,
        )

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            alignment_ab=8,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x256_64x3_64x64x64(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            alignment_ab=8,
            threadblock_shape=[128, 256, 64],
            warp_count=[2, 4, 1],
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4_128x128_32x3_64x64x32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            alignment_ab=4,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )

        # Channel count of 12 exercises the align-4 path.
        problem_sizes = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            alignment_ab=8,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )
        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Strided_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_128x128_32x3_64x64x32_align4(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            alignment_ab=4,
            threadblock_shape=[128, 128, 32],
            warp_count=[2, 2, 1],
        )

        # Stride-2 problems with even and odd spatial extents.
        problem_sizes = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 56, 56, 12),
                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(2, 2),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 55, 55, 12),
                cutlass_bindings.Tensor4DCoord(8, 1, 1, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(2, 2),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1,
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))


if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,129 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
    """Conv2d wgrad: f16 NHWC throughout, tensor-op f16 accumulation (SM80)."""

    @staticmethod
    def _build_operation(iterator_algorithm):
        """Assemble the shared all-f16 wgrad operation; only the iterator
        algorithm differs between tests."""
        instruction = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16,
            element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float16,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add,
        )
        tensor_a = TensorDescription(
            element=instruction.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tensor_b = TensorDescription(
            element=instruction.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tensor_c = TensorDescription(
            element=cutlass_bindings.float16,
            layout=cutlass_bindings.TensorNHWC,
            alignment=8,
        )
        tile = TileDescription(
            threadblock_shape=[128, 128, 64],
            stages=3,
            warp_count=[2, 2, 1],
            math_instruction=instruction,
        )
        epilogue = LinearCombination(
            tensor_c.element, tensor_c.alignment, instruction.element_accumulator,
            cutlass_bindings.float16
        )
        return Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80,
            tile_description=tile,
            A=tensor_a, B=tensor_b, C=tensor_c,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1,
        )

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.analytic)
        self.assertTrue(test_all_conv2d(operation))

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        operation = self._build_operation(
            cutlass_bindings.conv.IteratorAlgorithm.optimized)
        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,274 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: f16 NHWC inputs, f32 NHWC output, tensor-op f32
    accumulation (SM80).

    The five tests differ only in iterator algorithm, instruction/tile
    shapes, and operand alignment, so the shared construction is factored
    into ``_run_wgrad_test``.
    """

    def _run_wgrad_test(self, iterator_algorithm, instruction_shape,
                        threadblock_shape, warp_count,
                        alignment_ab=8, alignment_c=4, problem_sizes=None):
        """Build a wgrad Conv2dOperation with the given configuration and
        verify it with the conv2d testbed.

        :param iterator_algorithm: analytic or optimized iterator algorithm
        :param instruction_shape: MMA instruction shape [m, n, k]
        :param threadblock_shape: threadblock tile shape [m, n, k]
        :param warp_count: warps per threadblock [m, n, k]
        :param alignment_ab: vectorized-access alignment for A and B operands
        :param alignment_c: vectorized-access alignment for the C operand
        :param problem_sizes: optional explicit problem sizes; when None the
            testbed uses its built-in problem-size sweep
        """
        math_inst = MathInstruction(
            instruction_shape=instruction_shape,
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab)
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_c)

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape, stages=3,
            warp_count=warp_count,
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(operation))
        else:
            self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def _align4_problem_sizes(self):
        """Problem size used by the alignment-4 tests: channel count 12 is
        divisible by 4 but not by 8, exercising the reduced alignment."""
        return [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 4, 4, 12),
                cutlass_bindings.Tensor4DCoord(8, 3, 3, 12),
                cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        self._run_wgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            instruction_shape=[16, 8, 8],
            threadblock_shape=[128, 128, 16], warp_count=[2, 2, 1])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32(self):
        self._run_wgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 8],
            threadblock_shape=[128, 128, 16], warp_count=[2, 2, 1])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_64x256_32x4_64x64x32(self):
        self._run_wgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 16],
            threadblock_shape=[64, 256, 32], warp_count=[1, 4, 1])

    def test_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        self._run_wgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.analytic,
            instruction_shape=[16, 8, 8],
            threadblock_shape=[128, 128, 16], warp_count=[2, 2, 1],
            alignment_ab=4, problem_sizes=self._align4_problem_sizes())

    def test_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_align4(self):
        self._run_wgrad_test(
            cutlass_bindings.conv.IteratorAlgorithm.optimized,
            instruction_shape=[16, 8, 8],
            threadblock_shape=[128, 128, 16], warp_count=[2, 2, 1],
            alignment_ab=4, problem_sizes=self._align4_problem_sizes())
|
||||
|
||||
if __name__ == "__main__":
    # Reserve the device memory pool used by the testbed, then run the suite.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,128 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend.conv2d_operation import *
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: f32 NHWC operands, SIMT math, f32 accumulation (SM80).

    The two tests are identical except for the iterator algorithm, so the
    whole operation construction is factored into ``_run_wgrad_simt_test``.
    """

    def _run_wgrad_simt_test(self, iterator_algorithm):
        """Build a SIMT f32 wgrad Conv2dOperation and verify it with the
        conv2d testbed.

        :param iterator_algorithm: analytic or optimized iterator algorithm
        """
        # SIMT math uses a scalar multiply-add, hence the 1x1x1 instruction.
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[2, 4, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=iterator_algorithm,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Wgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_wgrad_simt_test(cutlass_bindings.conv.IteratorAlgorithm.analytic)

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        self._run_wgrad_simt_test(cutlass_bindings.conv.IteratorAlgorithm.optimized)
|
||||
|
||||
if __name__ == "__main__":
    # Reserve the device memory pool used by the testbed, then run the suite.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,139 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
# test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import unittest
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dWgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
    """Wgrad conv2d tests: tf32 NHWC inputs, f32 NHWC output, tensor-op f32
    accumulation (SM80).

    Both tests use the optimized iterator; they differ only in operand
    alignment and threadblock shape, so the construction is factored into
    ``_run_wgrad_tf32_test``.
    """

    def _run_wgrad_tf32_test(self, alignment_ab, alignment_c,
                             threadblock_shape, problem_sizes=None):
        """Build an optimized-iterator tf32 wgrad Conv2dOperation and verify
        it with the conv2d testbed.

        :param alignment_ab: vectorized-access alignment for A and B operands
        :param alignment_c: vectorized-access alignment for the C operand
        :param threadblock_shape: threadblock tile shape [m, n, k]
        :param problem_sizes: optional explicit problem sizes; when None the
            testbed uses its built-in problem-size sweep
        """
        # Host-side element type is float32; the tensor-op path presumably
        # rounds operands to tf32 internally — TODO confirm against backend.
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_ab)
        C = TensorDescription(
            element=cutlass_bindings.float32,
            layout=cutlass_bindings.TensorNHWC,
            alignment=alignment_c)

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape, stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass_bindings.conv.Operator.wgrad,
            iterator_algorithm=cutlass_bindings.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        if problem_sizes is None:
            self.assertTrue(test_all_conv2d(operation))
        else:
            self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        self._run_wgrad_tf32_test(
            alignment_ab=4, alignment_c=8,
            threadblock_shape=[128, 128, 16])

    def test_SM80_Device_Conv2d_Wgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_align1(self):
        # Single-channel problem: channel count 1 forces alignment-1 access.
        problem_sizes = [
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(1, 8, 8, 1),
                cutlass_bindings.Tensor4DCoord(1, 3, 3, 1),
                cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            ),
        ]
        self._run_wgrad_tf32_test(
            alignment_ab=1, alignment_c=4,
            threadblock_shape=[128, 128, 32],
            problem_sizes=problem_sizes)
|
||||
|
||||
if __name__ == "__main__":
    # Reserve the device memory pool used by the testbed, then run the suite.
    cutlass.backend.get_memory_pool(2**26, 2**26)
    unittest.main()
|
||||
@ -1,128 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmBF16TensorOpSm80(unittest.TestCase):
    """bf16 tensor-op GEMM tests with f32 accumulation (SM80).

    Both tests share the same math instruction and warp count; the shared
    construction is factored into ``_run_bf16_gemm_test``.
    """

    def _run_bf16_gemm_test(self, layout_a, layout_b, element_c, layout_c,
                            alignment_c, threadblock_shape, stages, testbed_mode):
        """Build a universal bf16 GEMM operation and verify it with the
        GEMM testbed.

        :param layout_a: layout of the A operand
        :param layout_b: layout of the B operand
        :param element_c: element type of the C operand
        :param layout_c: layout of the C operand
        :param alignment_c: vectorized-access alignment for the C operand
        :param threadblock_shape: threadblock tile shape [m, n, k]
        :param stages: number of pipeline stages
        :param testbed_mode: mode string passed through to test_all_gemm
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.bfloat16, element_b=cutlass_bindings.bfloat16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages, warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        A = TensorDescription(
            element=cutlass_bindings.bfloat16, layout=layout_a,
            alignment=8
        )
        B = TensorDescription(
            element=cutlass_bindings.bfloat16, layout=layout_b,
            alignment=8
        )
        C = TensorDescription(
            element=element_c, layout=layout_c,
            alignment=alignment_c
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass_bindings.float32)

        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1
        )

        self.assertTrue(test_all_gemm(operation, testbed_mode))

    def test_SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32_64x128x64_32x64x64(self):
        # BUG FIX: this method originally lacked the 'test_' prefix, so
        # unittest discovery silently never ran it.
        self._run_bf16_gemm_test(
            layout_a=cutlass_bindings.ColumnMajor,
            layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32,
            layout_c=cutlass_bindings.RowMajor,
            alignment_c=4,
            threadblock_shape=[64, 128, 64], stages=4,
            testbed_mode="universal")

    def test_SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32_128x256x64_64x64x64(self):
        # NOTE(review): the method name advertises a 128x256x64 tile but the
        # configuration uses [64, 128, 32] — name kept as-is to preserve the
        # public test identifier; confirm intended shape upstream.
        self._run_bf16_gemm_test(
            layout_a=cutlass_bindings.RowMajor,
            layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.bfloat16,
            layout_c=cutlass_bindings.RowMajor,
            alignment_c=8,
            threadblock_shape=[64, 128, 32], stages=6,
            testbed_mode="multistage")
|
||||
|
||||
if __name__ == "__main__":
    # Larger (2**30) memory pool than the conv tests: GEMM problem sweeps
    # allocate bigger tensors.
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,138 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from functools import partial
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
# Test-name generator with the bf16 element types and SM90 arch pre-bound.
name_fn = partial(get_name, element_a=cutlass_bindings.bfloat16, element_b=cutlass_bindings.bfloat16, arch=90)
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Create a test-running function with the given specification and set it as a method of `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool
    """

    def test_fn(self):
        """
        Dynamically-generated test that constructs a BF16 GEMM operation and verifies it
        against multiple test cases.
        """
        element_ab = cutlass_bindings.bfloat16

        # SIMT kernels pin a unit "instruction" shape and an explicit warp layout;
        # for Tensor Core kernels both are left as None.
        is_simt = opclass == cutlass_bindings.OpClass.Simt
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1] if is_simt else None,
            element_a=element_ab, element_b=element_ab,
            element_accumulator=element_accumulator,
            opcode_class=opclass, math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages,
            warp_count=[2, 2, 1] if is_simt else None,
            math_instruction=math_inst,
            persistent=persistent
        )

        A = TensorDescription(element=element_ab, layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=element_ab, layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        epilogue_functor = LinearCombination(
            C.element, C.alignment, math_inst.element_accumulator, element_epilogue)

        operation = GemmOperationUniversal(
            arch=90, tile_description=tile_description, A=A, B=B, C=C,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1)

        self.assertTrue(test_all_gemm(operation, "universal"))

    # Derive the test name from its configuration (marking the persistent
    # variant with a suffix) and attach the function to the wrapper class.
    suffix = "_persistent" if persistent else ""
    name = name_fn(layouts, alignments, element_output, element_accumulator,
                   element_epilogue, cluster_shape, threadblock_shape, stages,
                   opclass=opclass, suffix=suffix)
    setattr(cls, name, test_fn)

    return test_fn
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmBF16Sm90(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    # Intentionally empty: the add_test_* calls below attach test methods via setattr.
    pass
|
||||
|
||||
|
||||
# Bind the operation class once so each registration line below only lists the
# layout/alignment/shape combination under test.
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)

# Tensor Core tests with varying layouts, alignments, cluster shapes, and stage
# counts; the last one also exercises the persistent warp-specialized variant.
add_test_tensorop(GemmBF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmBF16Sm90, LayoutCombination.NNN, [4, 4, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 5)
add_test_tensorop(GemmBF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmBF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 32], None, persistent=True)

# SIMT test: scalar math, so all alignments are 1.
add_test_simt(GemmBF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.bfloat16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 8], 2)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Pre-allocate the backend memory pool (2**30 bytes each — presumably the
    # device and host pool sizes; confirm against cutlass.backend API).
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,479 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF16Sm80(unittest.TestCase):
    """
    Device-level tests of F16 universal GEMMs on SM80.

    Every test instantiates a GemmOperationUniversal with F16 operands A and B
    (alignment 8) on 16x8x16 Tensor Core instructions and verifies it with
    test_all_gemm. The tests differ only in threadblock shape, warp count,
    stage count, operand layouts, output type/alignment, and epilogue/swizzle
    configuration, so the shared construction logic lives in
    _run_universal_gemm.
    """

    def _run_universal_gemm(self, element_accumulator, threadblock_shape, warp_count,
                            layout_a, layout_b, element_c, layout_c, alignment_c,
                            element_epilogue, stages=3, swizzling_functor=None,
                            direct_store=False):
        """
        Construct an SM80 F16 universal GEMM with the given configuration and
        verify it against the standard test cases.

        :param element_accumulator: data type used in accumulation
        :param threadblock_shape: indexable container of threadblock tile dimensions
        :param warp_count: indexable container of warp counts per threadblock dimension
        :param layout_a: layout of operand A
        :param layout_b: layout of operand B
        :param element_c: data type of the output element
        :param layout_c: layout of the output
        :param alignment_c: alignment of the output
        :param element_epilogue: data type used in computing the epilogue
        :param stages: number of pipeline stages to use in the kernel
        :type stages: int
        :param swizzling_functor: threadblock swizzling functor; defaults to
                                  cutlass_bindings.IdentitySwizzle1
        :param direct_store: whether to pass direct_store=True to the operation
        :type direct_store: bool
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16,
            element_accumulator=element_accumulator,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages, warp_count=warp_count,
            math_instruction=math_inst
        )

        # A and B are always F16 with alignment 8 in this suite; only their
        # layouts vary between tests.
        A = TensorDescription(
            element=cutlass_bindings.float16, layout=layout_a, alignment=8)
        B = TensorDescription(
            element=cutlass_bindings.float16, layout=layout_b, alignment=8)
        C = TensorDescription(
            element=element_c, layout=layout_c, alignment=alignment_c)

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, element_epilogue)

        if swizzling_functor is None:
            swizzling_functor = cutlass_bindings.IdentitySwizzle1

        # Only forward direct_store when explicitly requested so that the
        # default path matches the operation constructor's own default.
        extra = {"direct_store": True} if direct_store else {}
        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor,
            **extra
        )

        self.assertTrue(test_all_gemm(operation, "universal"))

    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        # Exercises the batched identity swizzle and the direct-store epilogue path.
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 128, 32], warp_count=[2, 2, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float32, layout_c=cutlass_bindings.ColumnMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float32,
            swizzling_functor=cutlass_bindings.BatchedIdentitySwizzle,
            direct_store=True)

    def test_SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32_128x128x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 128, 64], warp_count=[2, 2, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.RowMajor,
            alignment_c=8, element_epilogue=cutlass_bindings.float32)

    def test_SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32, layout_c=cutlass_bindings.ColumnMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float32)

    def test_SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32, layout_c=cutlass_bindings.RowMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float32)

    def test_SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float16,
            threadblock_shape=[128, 64, 64], warp_count=[2, 1, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.RowMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float16)

    def test_SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32_64x64x32_32x32x32(self):
        # NOTE(review): despite the f32t/f32 in the name, this configuration
        # accumulates in F16 and emits F16 output — kept as-is to preserve
        # the original behavior.
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float16,
            threadblock_shape=[64, 64, 32], warp_count=[2, 2, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.RowMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float16,
            stages=10)

    def test_SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32_256x128x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[256, 128, 64], warp_count=[4, 2, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.RowMajor,
            alignment_c=8, element_epilogue=cutlass_bindings.float32)

    # NOTE(review): the doubled "test_test_" prefix is preserved from the
    # original — renaming would change which test name the runner reports.
    def test_test_SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k_128x64x64_64x64x32(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 64, 64], warp_count=[2, 1, 1],
            layout_a=cutlass_bindings.RowMajor, layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.RowMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float32)

    def test_SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32_128x256x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1],
            layout_a=cutlass_bindings.RowMajor, layout_b=cutlass_bindings.RowMajor,
            element_c=cutlass_bindings.float16, layout_c=cutlass_bindings.ColumnMajor,
            alignment_c=8, element_epilogue=cutlass_bindings.float32)

    def test_SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32_128x256x64_64x64x64(self):
        self._run_universal_gemm(
            element_accumulator=cutlass_bindings.float32,
            threadblock_shape=[128, 256, 64], warp_count=[2, 4, 1],
            layout_a=cutlass_bindings.ColumnMajor, layout_b=cutlass_bindings.ColumnMajor,
            element_c=cutlass_bindings.float32, layout_c=cutlass_bindings.ColumnMajor,
            alignment_c=4, element_epilogue=cutlass_bindings.float32)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Pre-allocate the backend memory pool (2**30 bytes each — presumably the
    # device and host pool sizes; confirm against cutlass.backend API).
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,182 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from functools import partial
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
# Partial specialization for naming tests
|
||||
name_fn = partial(get_name, element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16, arch=90)
|
||||
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Create a test-running function with the given specification and set it as a method of `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool
    """

    def run(self):
        """
        Dynamically-generated function that constructs a GEMM operation and verifies it against
        multiple test cases.
        """

        element_A = cutlass_bindings.float16
        element_B = cutlass_bindings.float16
        # SIMT kernels pin a unit "instruction" shape and an explicit warp
        # layout; for Tensor Core kernels both are left as None.
        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
        math_inst = MathInstruction(
            instruction_shape=inst_shape,
            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
            opcode_class=opclass, math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages, warp_count=warp_count,
            math_instruction=math_inst,
            persistent=persistent
        )

        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        epilogue_functor = LinearCombination(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)

        swizzling_functor = cutlass_bindings.IdentitySwizzle1

        operation = GemmOperationUniversal(
            arch=90, tile_description=tile_description, A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)

        self.assertTrue(test_all_gemm(operation, "universal"))

    # Name the generated test after its configuration (with a suffix marking
    # the persistent variant) and attach it to the wrapper class.
    if persistent:
        suffix = "_persistent"
    else:
        suffix = ""

    name = name_fn(layouts, alignments, element_output, element_accumulator,
                   element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass, suffix=suffix)
    setattr(cls, name, run)

    return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmF16Sm90(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__
    """
    # Intentionally empty: the add_test_* calls below attach test methods via setattr.
    pass
|
||||
|
||||
|
||||
# Partial bindings of add_test that fix the operation class, so each call below
# only has to supply layouts, alignments, dtypes, cluster/threadblock shapes and stages.
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)

# Tests with 1x1x1 clusters
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], 3)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [4, 4, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [8, 8, 8], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 64], 5)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNT, [2, 2, 2], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 32], None)

# Tests with different cluster shapes
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.NNN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 2, 1], [64, 128, 64], None)

# Tests for persistent warp-specialized threadblocks
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 1, 1], [128, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 2, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 2, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [1, 4, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [2, 4, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 1, 1], [64, 128, 64], None, persistent=True)
add_test_tensorop(GemmF16Sm90, LayoutCombination.TTN, [8, 8, 8], cutlass_bindings.float32, cutlass_bindings.float32, cutlass_bindings.float32, [4, 4, 1], [64, 128, 64], None, persistent=True)

# Tests using SIMT
add_test_simt(GemmF16Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 128, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [128, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.TTN, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float32, cutlass_bindings.float32, [1, 1, 1], [64, 64, 8], 2)
add_test_simt(GemmF16Sm90, LayoutCombination.NNT, [1, 1, 1], cutlass_bindings.float16, cutlass_bindings.float16, cutlass_bindings.float16, [1, 1, 1], [128, 128, 8], 2)


if __name__ == '__main__':
    # Reserve device/host memory for the test workspace before running the suite.
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,178 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.memory_manager import get_allocated_size
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF32nF32nF32nTensorOpF32Sm80(unittest.TestCase):
    """
    F32 GEMM tests for SM80 Tensor Cores.

    Each test builds a GemmOperationUniversal from an explicit MathInstruction /
    TileDescription / TensorDescription specification and runs it through
    ``test_all_gemm`` (project verification harness) in "universal" mode.
    """

    def _run_universal_gemm(self, math_operation, threadblock_shape, layout_a):
        """
        Build and verify a universal f32 GEMM for the given math operation,
        threadblock shape, and A-operand layout (B is column-major, C row-major,
        all operands alignment 4 — common to every test in this class).
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass_bindings.float32, element_b=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=math_operation
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=3, warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        A = TensorDescription(
            element=cutlass_bindings.float32, layout=layout_a,
            alignment=4
        )
        B = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.ColumnMajor,
            alignment=4
        )
        C = TensorDescription(
            element=cutlass_bindings.float32, layout=cutlass_bindings.RowMajor,
            alignment=4
        )

        element_epilogue = cutlass_bindings.float32

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, element_epilogue)

        swizzling_functor = cutlass_bindings.IdentitySwizzle1

        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
        )

        self.assertTrue(test_all_gemm(operation, "universal"))

    def test_SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32_128x128x32_64x64x32(self):
        # f32 inputs computed via fast bf16 emulation; A is row-major.
        self._run_universal_gemm(
            MathOperation.multiply_add_fast_bf16, [128, 128, 32], cutlass_bindings.RowMajor)

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32_128x128x32_64x64x32(self):
        # Plain f32 multiply-add; A is column-major.
        self._run_universal_gemm(
            MathOperation.multiply_add, [128, 128, 32], cutlass_bindings.ColumnMajor)

    def test_SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_fast_accurate_f32_64x64x32_32x32x32(self):
        # Fast-accurate f32 (3xTF32-style) emulation with a smaller 64x64x32 tile.
        self._run_universal_gemm(
            MathOperation.multiply_add_fast_f32, [64, 64, 32], cutlass_bindings.ColumnMajor)
|
||||
|
||||
if __name__ == '__main__':
    # Allocate a (smaller) memory pool for this suite and warm the kernel
    # compilation cache before running the tests.
    cutlass.backend.get_memory_pool(2**24, 2**24)
    cutlass.backend.compiler.load_from_cache()
    unittest.main()
|
||||
@ -1,134 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmF64TensorOpSm80(unittest.TestCase):
    """
    Double-precision Tensor Core GEMM tests for SM80.

    Both tests build a GemmOperationUniversal with f64 operands and verify it
    through ``test_all_gemm`` (project verification harness) in "universal" mode.
    """

    def _run_universal_gemm(self, threadblock_shape, layout_a, layout_b):
        """
        Build and verify an f64 universal GEMM for the given threadblock shape
        and A/B layouts. C is always row-major; all operands use alignment 1
        (alignment 1 restricted for double).
        """
        math_inst = MathInstruction(
            instruction_shape=[8, 8, 4],
            element_a=cutlass_bindings.float64, element_b=cutlass_bindings.float64,
            element_accumulator=cutlass_bindings.float64, opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=4, warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        # alignment 1 restricted for double
        A = TensorDescription(
            element=cutlass_bindings.float64, layout=layout_a,
            alignment=1
        )
        B = TensorDescription(
            element=cutlass_bindings.float64, layout=layout_b,
            alignment=1
        )
        C = TensorDescription(
            element=cutlass_bindings.float64, layout=cutlass_bindings.RowMajor,
            alignment=1
        )

        element_epilogue = cutlass_bindings.float64

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, element_epilogue)

        swizzling_functor = cutlass_bindings.IdentitySwizzle1

        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
        )

        self.assertTrue(test_all_gemm(operation, "universal"))

    def test_SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64_32x32x16_16x16x16(self):
        # A column-major, B row-major, 32x32x16 threadblock tile.
        self._run_universal_gemm(
            [32, 32, 16], cutlass_bindings.ColumnMajor, cutlass_bindings.RowMajor)

    def test_SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64_64x64x16_32x32x16(self):
        # A row-major, B column-major, 64x64x16 threadblock tile.
        self._run_universal_gemm(
            [64, 64, 16], cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor)
|
||||
|
||||
if __name__ == '__main__':
    # Reserve device/host memory for the test workspace before running the suite.
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,124 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from functools import partial
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
# Pre-bind the fixed parts of the generated test name: f64 operands on SM90.
name_fn = partial(get_name, element_a=cutlass_bindings.float64, element_b=cutlass_bindings.float64, arch=90)
|
||||
|
||||
def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass):
    """
    Create a test-running function with the given specification and set it as a method of `cls`.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass

    :return: the generated test function (also attached to ``cls``)
    """

    def run(self):
        """
        Dynamically-generated function that constructs a GEMM operation and verifies it against
        multiple test cases.
        """
        element_A = cutlass_bindings.float64
        element_B = cutlass_bindings.float64
        # SIMT kernels use a scalar 1x1x1 "instruction" and an explicit warp
        # count; Tensor Core kernels let the backend pick both (None).
        inst_shape = [1, 1, 1] if opclass == cutlass_bindings.OpClass.Simt else None
        warp_count = [2, 2, 1] if opclass == cutlass_bindings.OpClass.Simt else None
        math_inst = MathInstruction(
            instruction_shape=inst_shape,
            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
            opcode_class=opclass, math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages, warp_count=warp_count,
            math_instruction=math_inst
        )

        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        epilogue_functor = LinearCombination(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)

        swizzling_functor = cutlass_bindings.IdentitySwizzle1

        operation = GemmOperationUniversal(
            arch=90, tile_description=tile_description, A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor)

        self.assertTrue(test_all_gemm(operation, "universal"))

    # Derive a unique, descriptive method name from the configuration and
    # attach the generated test to the target class.
    name = name_fn(layouts, alignments, element_output, element_accumulator,
                   element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass)
    setattr(cls, name, run)

    return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmF64Sm90(unittest.TestCase):
    """
    Wrapper class to which tests will be added dynamically in __main__

    Test methods are attached via ``setattr`` by the ``add_test`` helper; the
    class body is intentionally empty.
    """
    pass
|
||||
|
||||
|
||||
# Register the single SIMT f64 test configuration on the wrapper class.
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)
add_test_simt(GemmF64Sm90, LayoutCombination.NNN, [1, 1, 1], cutlass_bindings.float64, cutlass_bindings.float64, cutlass_bindings.float64, [1, 1, 1], [64, 64, 32], 2)


if __name__ == '__main__':
    # Reserve device/host memory for the test workspace before running the suite.
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,235 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_grouped_testbed import TestbedGrouped
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmGroupedSm80(unittest.TestCase):
    """Grouped GEMM tests for SM80.

    Every test builds the same pipeline (math instruction, tile description,
    operand descriptions, epilogue) and differs only in its specification, so
    the shared construction lives in :meth:`_run_grouped_gemm` and each test
    simply supplies its parameters.
    """

    def _run_grouped_gemm(self, instruction_shape, element_ab, element_accumulator,
                          opcode_class, threadblock_shape, stages,
                          layouts, alignments, element_c, problem_count):
        """Build a grouped GEMM operation and verify it with TestbedGrouped.

        :param instruction_shape: MMA instruction shape [m, n, k]
        :param element_ab: data type shared by the A and B operands
        :param element_accumulator: accumulator data type (also used as the
            epilogue compute type in every test of this suite)
        :param opcode_class: operation class (TensorOp or Simt)
        :param threadblock_shape: threadblock tile shape [m, n, k]
        :param stages: number of software pipeline stages
        :param layouts: layouts of the A, B, and C operands (indexable)
        :param alignments: alignments of the A, B, and C operands (indexable)
        :param element_c: data type of the C operand
        :param problem_count: number of problems the testbed runs per mode
        """
        math_inst = MathInstruction(
            instruction_shape=instruction_shape, element_a=element_ab,
            element_b=element_ab, element_accumulator=element_accumulator,
            opcode_class=opcode_class,
            math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages, warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        A = TensorDescription(element=element_ab, layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=element_ab, layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_c, layout=layouts[2], alignment=alignments[2])

        # All tests in this suite compute the epilogue in the accumulator type.
        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, math_inst.element_accumulator)
        swizzling_functor = cutlass_bindings.BatchedIdentitySwizzle

        # Exercise both scheduler precompute modes with the same operation spec.
        for precompute_mode in [SchedulerMode.Device, SchedulerMode.Host]:
            operation = GemmOperationGrouped(
                80,
                tile_description, A, B, C,
                epilogue_functor, swizzling_functor,
                precompute_mode=precompute_mode
            )

            testbed = TestbedGrouped(operation=operation)

            self.assertTrue(testbed.run(problem_count))

    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32(self):
        self._run_grouped_gemm(
            instruction_shape=[16, 8, 16],
            element_ab=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            threadblock_shape=[128, 128, 32], stages=3,
            layouts=[cutlass_bindings.ColumnMajor] * 3,
            alignments=[8, 8, 4],
            element_c=cutlass_bindings.float32,
            problem_count=24
        )

    def test_SM80_Device_GemmGrouped_f64t_f64t_f64n_tensor_op_f64_64x64x16_32x32x16(self):
        self._run_grouped_gemm(
            instruction_shape=[8, 8, 4],
            element_ab=cutlass_bindings.float64,
            element_accumulator=cutlass_bindings.float64,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            threadblock_shape=[64, 64, 16], stages=4,
            layouts=[cutlass_bindings.RowMajor, cutlass_bindings.RowMajor,
                     cutlass_bindings.ColumnMajor],
            alignments=[1, 1, 1],
            element_c=cutlass_bindings.float64,
            problem_count=24
        )

    def test_SM80_Device_GemmGrouped_f32t_f32t_f32t_simt_f32_128x64x8_64x32x1(self):
        self._run_grouped_gemm(
            instruction_shape=[1, 1, 1],
            element_ab=cutlass_bindings.float32,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.Simt,
            threadblock_shape=[128, 64, 8], stages=4,
            layouts=[cutlass_bindings.RowMajor] * 3,
            alignments=[1, 1, 1],
            element_c=cutlass_bindings.float32,
            problem_count=27
        )

    def test_SM80_Device_GemmGrouped_f16n_f16t_f32n_tensor_op_f32_128x128x32_64x64x32_cache(self):
        # Same spec as the first test but with a smaller problem count,
        # exercising the cached-result path of the testbed.
        self._run_grouped_gemm(
            instruction_shape=[16, 8, 16],
            element_ab=cutlass_bindings.float16,
            element_accumulator=cutlass_bindings.float32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            threadblock_shape=[128, 128, 32], stages=3,
            layouts=[cutlass_bindings.ColumnMajor] * 3,
            alignments=[8, 8, 4],
            element_c=cutlass_bindings.float32,
            problem_count=5
        )
|
||||
if __name__ == '__main__':
    # Pre-allocate the backend memory pool before running any kernels
    # (2**30-byte arguments -- presumably init/max sizes; TODO confirm).
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,261 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend.epilogue import LinearCombinationClamp
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class GemmS8TensorOpF32Sm80(unittest.TestCase):
    """int8 Tensor Core GEMM tests for SM80.

    All tests share the same int8 MMA instruction and differ only in tile
    shape, stage count, layouts, alignments, output type, and testbed mode,
    so the shared construction lives in :meth:`_run_s8_gemm`.
    """

    def _run_s8_gemm(self, math_operation, threadblock_shape, stages,
                     layouts, alignments, element_c, testbed_mode):
        """Build an int8 universal GEMM operation and verify it with the testbed.

        :param math_operation: MMA math op (multiply_add or multiply_add_saturate)
        :param threadblock_shape: threadblock tile shape [m, n, k]
        :param stages: number of software pipeline stages
        :param layouts: layouts of the A, B, and C operands (indexable)
        :param alignments: alignments of the A, B, and C operands (indexable)
        :param element_c: data type of the C operand (int8 or int32)
        :param testbed_mode: mode string passed to test_all_gemm
            (e.g. "interleaved" or "multistage")
        """
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 32],
            element_a=cutlass_bindings.int8, element_b=cutlass_bindings.int8,
            element_accumulator=cutlass_bindings.int32,
            opcode_class=cutlass_bindings.OpClass.TensorOp,
            math_operation=math_operation
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            stages=stages, warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        A = TensorDescription(element=cutlass_bindings.int8,
                              layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=cutlass_bindings.int8,
                              layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_c,
                              layout=layouts[2], alignment=alignments[2])

        if element_c == cutlass_bindings.int8:
            # int8 output: clamp through the fast functor, which needs no
            # separate epilogue compute type.
            epilogue_functor = FastLinearCombinationClamp(
                C.element, C.alignment
            )
        else:
            # int32 output: clamp with the epilogue computed in int32.
            epilogue_functor = LinearCombinationClamp(
                C.element, C.alignment, math_inst.element_accumulator,
                cutlass_bindings.int32
            )

        swizzling_functor = cutlass_bindings.IdentitySwizzle1

        operation = GemmOperationUniversal(
            arch=80, tile_description=tile_description,
            A=A, B=B, C=C,
            epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
        )

        self.assertTrue(test_all_gemm(operation, testbed_mode))

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_64x64x64_32x32x64(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add_saturate,
            threadblock_shape=[64, 64, 64], stages=6,
            layouts=[cutlass_bindings.ColumnMajorInterleaved32,
                     cutlass_bindings.RowMajorInterleaved32,
                     cutlass_bindings.ColumnMajorInterleaved32],
            alignments=[16, 16, 8],
            element_c=cutlass_bindings.int8,
            testbed_mode="interleaved"
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32_256x128x128_64x64x128(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add,
            threadblock_shape=[128, 128, 128], stages=3,
            layouts=[cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor,
                     cutlass_bindings.RowMajor],
            alignments=[16, 16, 16],
            element_c=cutlass_bindings.int8,
            testbed_mode="multistage"
        )

    def test_SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add,
            threadblock_shape=[128, 128, 128], stages=3,
            layouts=[cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor,
                     cutlass_bindings.ColumnMajor],
            alignments=[16, 16, 16],
            element_c=cutlass_bindings.int8,
            testbed_mode="multistage"
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32n_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add,
            threadblock_shape=[128, 128, 128], stages=3,
            layouts=[cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor,
                     cutlass_bindings.ColumnMajor],
            alignments=[16, 16, 4],
            element_c=cutlass_bindings.int32,
            testbed_mode="multistage"
        )

    def test_SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32_128x128x128_64x64x128(self):
        self._run_s8_gemm(
            math_operation=MathOperation.multiply_add,
            threadblock_shape=[128, 128, 128], stages=3,
            layouts=[cutlass_bindings.RowMajor, cutlass_bindings.ColumnMajor,
                     cutlass_bindings.RowMajor],
            alignments=[16, 16, 4],
            element_c=cutlass_bindings.int32,
            testbed_mode="multistage"
        )
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Pre-allocate the backend memory pool before running any kernels
    # (2**30-byte arguments -- presumably init/max sizes; TODO confirm).
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,154 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from functools import partial
|
||||
import cutlass.backend
|
||||
from cutlass.backend import *
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.test import *
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, get_name
|
||||
from cutlass.backend.test.gemm_testbed import test_all_gemm
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
|
||||
# NOTE(review): name_fn is bound with element_a/b = float16 although the
# generated kernels use int8 operands -- confirm get_name keys off
# element_output here rather than element_a/element_b.
name_fn = partial(get_name, element_a=cutlass_bindings.float16, element_b=cutlass_bindings.float16, arch=90)

def add_test(cls, layouts, alignments, element_output, element_accumulator, element_epilogue,
             cluster_shape, threadblock_shape, stages, opclass, persistent=False):
    """
    Generate a test method for the given GEMM specification and attach it to `cls`.

    :param cls: class that receives the generated method
    :type cls: type
    :param layouts: layouts of the A, B, and C operands (indexable)
    :param alignments: alignments of the A, B, and C operands (indexable)
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in the epilogue computation
    :param cluster_shape: threadblock cluster dimensions (indexable)
    :param threadblock_shape: threadblock tile dimensions (indexable)
    :param stages: number of pipeline stages in the kernel
    :type stages: int
    :param opclass: class of operation performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass_bindings.OpClass
    :param persistent: whether this is a persistent warp-specialized kernel
    :type persistent: bool
    """

    def run(self):
        """
        Dynamically-generated method: builds the specified int8 GEMM operation
        and verifies it against the universal testbed.
        """
        element_A = cutlass_bindings.int8
        element_B = cutlass_bindings.int8
        use_simt = opclass == cutlass_bindings.OpClass.Simt

        # SIMT kernels require an explicit 1x1x1 instruction and warp count;
        # for Tensor Core kernels both are left for the backend to choose.
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1] if use_simt else None,
            element_a=element_A, element_b=element_B, element_accumulator=element_accumulator,
            opcode_class=opclass, math_operation=MathOperation.multiply_add
        )

        tile_description = TileDescription(
            threadblock_shape=threadblock_shape,
            cluster_shape=cluster_shape,
            stages=stages, warp_count=[2, 2, 1] if use_simt else None,
            math_instruction=math_inst,
            persistent=persistent
        )

        A = TensorDescription(element=element_A, layout=layouts[0], alignment=alignments[0])
        B = TensorDescription(element=element_B, layout=layouts[1], alignment=alignments[1])
        C = TensorDescription(element=element_output, layout=layouts[2], alignment=alignments[2])

        # SIMT epilogues clamp the output; Tensor Core epilogues use the
        # plain linear combination.
        functor_cls = LinearCombinationClamp if use_simt else LinearCombination
        epilogue_functor = functor_cls(C.element, C.alignment, math_inst.element_accumulator, element_epilogue)

        operation = GemmOperationUniversal(
            arch=90, tile_description=tile_description, A=A, B=B, C=C,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass_bindings.IdentitySwizzle1)

        self.assertTrue(test_all_gemm(operation, "universal"))

    suffix = "_persistent" if persistent else ""

    name = name_fn(layouts, alignments, element_output, element_accumulator,
                   element_epilogue, cluster_shape, threadblock_shape, stages, opclass=opclass, suffix=suffix)
    setattr(cls, name, run)

    return run
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() < 90, "Device compute capability is insufficient for SM90 tests.")
class GemmS8Sm90(unittest.TestCase):
    """
    Empty container class: the int8 SM90 GEMM test methods are attached to it
    dynamically at module import time via add_test (see the calls below).
    """
    pass
|
||||
|
||||
|
||||
# Specialize add_test for the two operation classes exercised below.
add_test_tensorop = partial(add_test, opclass=cutlass_bindings.OpClass.TensorOp)
add_test_simt = partial(add_test, opclass=cutlass_bindings.OpClass.Simt)

# Tests with 1x1x1 clusters: vary stage count, alignments, and tile shape.
# Arguments: (cls, layouts, alignments, element_output, element_accumulator,
#             element_epilogue, cluster_shape, threadblock_shape, stages).
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNN, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], 3)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 8], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [64, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 64, 32], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [4, 4, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [128, 128, 128], None)

# Tests with different cluster shapes
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [2, 2, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 4, 1], [128, 128, 128], None)
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [4, 4, 1], [128, 128, 128], None)

# Tests with persistent warp-specialized threadblocks
add_test_tensorop(GemmS8Sm90, LayoutCombination.TNT, [16, 16, 16], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [2, 1, 1], [128, 128, 128], None, persistent=True)

# Tests for SIMT
add_test_simt(GemmS8Sm90, LayoutCombination.TNN, [1, 1, 1], cutlass_bindings.int8, cutlass_bindings.int32, cutlass_bindings.int32, [1, 1, 1], [64, 32, 8], 2)
|
||||
|
||||
if __name__ == '__main__':
    # Pre-allocate the backend memory pool before running any kernels
    # (2**30-byte arguments -- presumably init/max sizes; TODO confirm).
    cutlass.backend.get_memory_pool(2**30, 2**30)
    unittest.main()
|
||||
@ -1,508 +0,0 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Util Functions for Conv2d Test
|
||||
"""
|
||||
import torch
|
||||
import cutlass
|
||||
import unittest
|
||||
import cutlass_bindings
|
||||
from cutlass.utils.datatypes import binding_type, binding_opclass
|
||||
from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from cutlass.backend.test.utils import get_name_conv2d
|
||||
import numpy as np
|
||||
|
||||
def conv2d_few_channel_problemsizes(channels):
    """Return a list of Conv2dProblemSize cases for few-channel inputs.

    All cases share a batch of 1, square inputs and filters, 1-pixel padding
    on every side, unit dilation, cross-correlation mode, and split-K /
    group counts of 1; only the input extent, filter count, filter extent,
    and stride vary per case.

    :param channels: channel count C shared by the input and filter tensors
    :return: list of cutlass_bindings.conv.Conv2dProblemSize
    """
    # (input H = W, filter count K, filter extent R = S, stride)
    specs = [
        (8, 16, 3, 2),
        (16, 16, 3, 2),
        (16, 16, 7, 1),
        (224, 32, 7, 1),
        (224, 64, 7, 2),
        (224, 64, 5, 1),
        (224, 64, 5, 2),
    ]

    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(1, extent, extent, channels),
            cutlass_bindings.Tensor4DCoord(k, filter_extent, filter_extent, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride, stride),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for extent, k, filter_extent, stride in specs
    ]
|
||||
|
||||
# Map CUTLASS data types to the torch dtypes used by the torch-backed launcher.
torch_dtype = {
    cutlass.DataType.f16: torch.float16,
    cutlass.DataType.f32: torch.float32,
    cutlass.DataType.f64: torch.float64
}

# Same mapping for the numpy-backed launcher.
numpy_dtype = {
    cutlass.DataType.f16: np.float16,
    cutlass.DataType.f32: np.float32,
    cutlass.DataType.f64: np.float64
}
|
||||
|
||||
|
||||
def validate_problem_size(ps, conv_kind, split_k_slices):
    """Return True if the conv2d problem size is self-consistent and supported.

    Verifies that the stored output extent (P, Q) matches the extent implied
    by the input, filter, padding, dilation, and stride, and rejects split-K
    runs of strided dgrad, which are not supported.

    :param ps: Conv2dProblemSize-like object (H, W, R, S, P, Q, pads, strides, dilations)
    :param conv_kind: conv operation as a string ("fprop", "dgrad", or "wgrad")
    :param split_k_slices: number of split-K slices
    :return: bool
    """
    expected_p = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
    expected_q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad
    is_strided = ps.stride_h > 1 or ps.stride_w > 1
    if conv_kind == "dgrad" and split_k_slices > 1 and is_strided:
        return False

    return True
|
||||
|
||||
|
||||
# Override the backend launcher
|
||||
class Conv2dLauncherFrontend(Conv2dLauncher):
|
||||
    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend: str = "numpy"):
        """Configure the launcher from a frontend Conv2d plan.

        :param plan: frontend conv2d plan supplying operand dtypes, conv kind,
            and activation
        :param seed: RNG seed for test-tensor generation
        :param backend: "numpy" or "torch" tensor backend
        """
        # NOTE(review): intentionally does not call super().__init__() --
        # confirm the base Conv2dLauncher setup is not needed on this path.
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        # Operand data types are pulled from the plan's private attributes.
        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator

        # All operands use the NHWC tensor layout.
        self.layout_A = cutlass_bindings.TensorNHWC
        self.layout_B = cutlass_bindings.TensorNHWC
        self.layout_C = cutlass_bindings.TensorNHWC
        self.layout_D = cutlass_bindings.TensorNHWC

        self.element_compute = cutlass_bindings.float32
        self.enable_cached_results = True

        # Get randomization_max: narrower random range for low-precision
        # operands/accumulators so reference comparisons stay exact.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
                self.randomization_max = 2
            else:
                self.randomization_max = 3
        else:
            self.randomization_max = 7

        self.activation = plan.activation

        # Host-side conv2d reference implementation from the bindings.
        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d
|
||||
|
||||
|
||||
def set_seed(self):
|
||||
if self.backend == "numpy":
|
||||
np.random.seed(self.seed)
|
||||
else:
|
||||
torch.manual_seed(self.seed)
|
||||
|
||||
    def uniform_init(self, size, dtype):
        """Return a random tensor of shape `size` with integer-valued entries.

        The numpy backend delegates to the base launcher; the torch backend
        draws uniformly from [-randomization_max - 0.5, randomization_max - 0.5)
        on the GPU and ceils, yielding integer-valued floats in a
        channels_last CUDA tensor.
        """
        if self.backend == "numpy":
            return super().uniform_init(size, numpy_dtype[dtype])
        else:
            tensor = torch.ceil(
                torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
            ).to(memory_format=torch.channels_last)
            return tensor
|
||||
|
||||
def zeros_like(self, tensor):
|
||||
if self.backend == "numpy":
|
||||
return np.zeros_like(tensor)
|
||||
else:
|
||||
return torch.zeros_like(tensor).to(memory_format=torch.channels_last)
|
||||
|
||||
def reference(self, ps, A, B, C, alpha, beta, activation):
|
||||
if self.backend == "numpy":
|
||||
numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
|
||||
return numpy_result
|
||||
else:
|
||||
if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
|
||||
torch_result = alpha * torch.ops.aten.conv2d(
|
||||
A,
|
||||
B,
|
||||
stride=(ps.stride_h, ps.stride_w),
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
dilation=(ps.dilation_h, ps.dilation_w)
|
||||
) + beta * C
|
||||
elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
|
||||
torch_result = alpha * torch.nn.grad.conv2d_input(
|
||||
(ps.N, ps.C, ps.H, ps.W),
|
||||
B,
|
||||
A,
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
stride=(ps.stride_h, ps.stride_w)
|
||||
) + beta * C
|
||||
elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
|
||||
torch_result = alpha * torch.nn.grad.conv2d_weight(
|
||||
B,
|
||||
(ps.K, ps.C, ps.R, ps.S),
|
||||
A,
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
stride=(ps.stride_h, ps.stride_w)
|
||||
) + beta * C
|
||||
else:
|
||||
raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
|
||||
|
||||
if activation == cutlass.backend.epilogue.relu:
|
||||
torch_result = torch.nn.functional.relu(torch_result)
|
||||
elif activation == cutlass.backend.epilogue.leaky_relu:
|
||||
torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
|
||||
|
||||
return torch_result
|
||||
|
||||
def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
|
||||
if self.element_compute == cutlass_bindings.float16:
|
||||
alpha = cutlass_bindings.float16(alpha)
|
||||
beta = cutlass_bindings.float16(beta)
|
||||
elif self.element_compute == cutlass_bindings.int32:
|
||||
alpha = int(alpha)
|
||||
beta = int(beta)
|
||||
else:
|
||||
alpha = alpha
|
||||
beta = beta
|
||||
|
||||
# If cached result is loaded
|
||||
cached_result_loaded = False
|
||||
|
||||
if self.enable_cached_results:
|
||||
# Get problem key
|
||||
cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
|
||||
self.conv_kind,
|
||||
problem_size,
|
||||
alpha,
|
||||
beta,
|
||||
getTensorView(
|
||||
tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
|
||||
),
|
||||
getTensorView(
|
||||
tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
|
||||
),
|
||||
getTensorView(
|
||||
tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
|
||||
),
|
||||
)
|
||||
|
||||
cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"
|
||||
|
||||
cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()
|
||||
|
||||
conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
|
||||
self.operation.arch,
|
||||
self.seed,
|
||||
)
|
||||
|
||||
cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
|
||||
conv2d_result_cache_name
|
||||
)
|
||||
# CachedTestResultListing cached_results(conv2d_result_cache_name);
|
||||
cached = cached_results.find(cached_test_key)
|
||||
cached_result_loaded = cached[0]
|
||||
if cached_result_loaded:
|
||||
cached_test_result = cached[1]
|
||||
|
||||
if not cached_result_loaded:
|
||||
# Compute the conv2d on host
|
||||
tensor_D_ref = np.ones_like(tensor_C)
|
||||
tensor_ref_A = getTensorRef(
|
||||
tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
|
||||
)
|
||||
tensor_ref_B = getTensorRef(
|
||||
tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
|
||||
)
|
||||
tensor_ref_C = getTensorRef(
|
||||
tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
|
||||
)
|
||||
tensor_ref_D_ref = getTensorRef(
|
||||
tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
|
||||
)
|
||||
|
||||
self.host_conv2d(
|
||||
self.conv_kind,
|
||||
problem_size,
|
||||
tensor_ref_A,
|
||||
tensor_ref_B,
|
||||
tensor_ref_C,
|
||||
tensor_ref_D_ref,
|
||||
alpha,
|
||||
beta,
|
||||
)
|
||||
|
||||
if activation == cutlass.backend.epilogue.leaky_relu:
|
||||
tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
|
||||
else:
|
||||
tensor_D_ref = activation.numpy(tensor_D_ref)
|
||||
|
||||
tensor_view_D_ref = getTensorView(
|
||||
tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
|
||||
)
|
||||
|
||||
if self.enable_cached_results:
|
||||
cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
|
||||
tensor_view_D_ref
|
||||
)
|
||||
cached_results = (
|
||||
cutlass_bindings.test.conv.host.CachedTestResultListing(
|
||||
conv2d_result_cache_name
|
||||
)
|
||||
)
|
||||
cached_results.append(cached_test_key, cached_test_result)
|
||||
cached_results.write(conv2d_result_cache_name)
|
||||
else:
|
||||
return tensor_D_ref
|
||||
|
||||
return cached_test_result.D
|
||||
|
||||
def equal(self, tensor_D, tensor_D_ref, problem_size):
|
||||
if self.backend == "numpy":
|
||||
return super().equal(tensor_D, tensor_D_ref, problem_size)
|
||||
else:
|
||||
torch.cuda.synchronize()
|
||||
return torch.equal(tensor_D, tensor_D_ref)
|
||||
|
||||
|
||||
def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
|
||||
|
||||
#
|
||||
# Initialize input and output tensors
|
||||
#
|
||||
if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
|
||||
if self.backend == "torch":
|
||||
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
else:
|
||||
tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
|
||||
tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
|
||||
tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
|
||||
elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
|
||||
if self.backend == "torch":
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
else:
|
||||
tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
|
||||
tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
|
||||
tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
|
||||
elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
|
||||
if self.backend == "torch":
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
else:
|
||||
tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
|
||||
tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
|
||||
tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
|
||||
else:
|
||||
raise Exception(f"Conv kind {self.conv_kind} is not supported")
|
||||
|
||||
self.set_seed()
|
||||
|
||||
tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
|
||||
tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
|
||||
tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
|
||||
tensor_D = self.zeros_like(tensor_C)
|
||||
|
||||
self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
|
||||
stride=(ps.stride_h, ps.stride_w),
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
dilation=(ps.dilation_h, ps.dilation_w),
|
||||
alpha=alpha, beta=beta,
|
||||
split_k=(split_k_mode, split_k_slices))
|
||||
|
||||
tensor_D_ref = self.reference(
|
||||
ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
|
||||
)
|
||||
|
||||
return self.equal(tensor_D, tensor_D_ref, ps)
|
||||
|
||||
|
||||
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation = "identity"
):
    """Create a test-running function with the given specification.

    Builds a closure that constructs a ``cutlass.Conv2d`` plan with the
    requested data types / tile description and verifies it over every valid
    problem size, then attaches the closure to test class ``cls`` under a
    name derived from the specification. Returns the closure.
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )

        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation != "identity":
            # leaky_relu carries a fixed 0.5 slope to match the launcher's
            # reference computation.
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)

        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")

        for ps in problem_sizes:
            # Skip sizes the kernel configuration cannot legally run.
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue

            # alpha=1.0, beta=0.5 exercises the full epilogue.
            self.assertTrue(
                conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
            )

    setattr(cls, test_name, run)

    return run
|
||||
|
||||
|
||||
def get_conv_problems():
    """Return the default Conv2d problem-size list plus alignment-stress cases.

    Starts from the bindings' default testbed sizes (64-channel minimum) and
    appends three 3x3 problems whose channel counts exercise the alignment 4
    and alignment 2 code paths.
    """
    # 64: minimum channel size
    problems = list(cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes)

    # Insert alignment 4 & 2 tests: (input NHWC, filter KRSC, padding) per case.
    extra_cases = [
        ((1, 4, 4, 12), (8, 3, 3, 12), (0, 0, 0, 0)),
        ((1, 4, 4, 14), (8, 3, 3, 14), (0, 0, 0, 0)),
        ((1, 23, 56, 98), (128, 3, 3, 98), (4, 0, 5, 0)),
    ]
    for input_size, filter_size, padding in extra_cases:
        problems.append(
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(*input_size),
                cutlass_bindings.Tensor4DCoord(*filter_size),
                cutlass_bindings.Tensor4DCoord(*padding),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            )
        )

    return problems
|
||||
660
test/python/cutlass/conv2d/conv2d_problem_sizes.py
Normal file
660
test/python/cutlass/conv2d/conv2d_problem_sizes.py
Normal file
@ -0,0 +1,660 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Utilities for defining Conv2D problem sizes for testing.
|
||||
|
||||
This file was ported from the C++ version in test/unit/conv/device/conv2d_problems.h
|
||||
"""
|
||||
|
||||
import cutlass
|
||||
from cutlass import ConvMode
|
||||
from cutlass.shape import Conv2DProblemSize
|
||||
|
||||
|
||||
class TestbedConv2dProblemSizes:
    """Collection of Conv2d problem sizes used by the unit tests.

    Builds the default, rigorous, ResNet-50 (batch 1 and batch 34), and
    grouped problem-size lists, then keeps in ``self.all`` only those
    problems whose per-group channel count is a multiple of
    ``minimum_channel_size``.
    """

    def __init__(self, minimum_channel_size: int):
        candidate_lists = [
            self.initialize_conv2d_default_sizes(minimum_channel_size),
            self.initialize_conv2d_rigorous_sizes(minimum_channel_size),
            self.initialize_conv2d_resnet50_sizes(1),
            self.initialize_conv2d_resnet50_sizes(34),
            self.initialize_conv2d_grouped_sizes(),
        ]

        # Filter all problems: keep only those with channels-per-group
        # divisible by the minimum channel size.
        self.all = [
            size
            for size_list in candidate_lists
            for size in size_list
            if (size.C // size.groups) % minimum_channel_size == 0
        ]

    def initialize_conv2d_default_sizes(self, minimum_channel_size):
        """Small/medium default problem sizes (mirrors the C++ testbed)."""
        m = minimum_channel_size
        # Each spec is the positional Conv2DProblemSize argument list:
        # (N, H, W, C,  K, R, S, C,  pad_h, pad_w,  stride_h, stride_w,  dilation_h, dilation_w)
        specs = [
            # Small input size x stride (1,1)
            # C < CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
            (1, 1, 1, m, 8, 1, 1, m, 1, 1, 1, 1, 1, 1),
            (1, 1, 8, m, 8, 1, 3, m, 1, 1, 1, 1, 1, 1),
            (1, 7, 8, m, 8, 3, 3, m, 1, 1, 1, 1, 1, 1),
            (1, 7, 9, m, 8, 4, 4, m, 1, 1, 1, 1, 1, 1),
            (2, 7, 9, m, 8, 5, 5, m, 1, 1, 1, 1, 1, 1),
            (3, 7, 9, m, 8, 6, 5, m, 1, 1, 1, 1, 1, 1),
            (3, 7, 9, m, 8, 6, 6, m, 1, 1, 1, 1, 1, 1),
            (3, 7, 9, m, 8, 7, 7, m, 1, 1, 1, 1, 1, 1),
            # Small input size x stride (2,2) and (3,3)
            (1, 11, 7, m, 8, 1, 1, m, 0, 0, 2, 2, 1, 1),
            (1, 11, 7, m, 8, 3, 3, m, 1, 1, 2, 2, 1, 1),
            (1, 13, 11, m, 8, 1, 1, m, 1, 1, 2, 2, 1, 1),
            (1, 17, 19, m, 16, 2, 2, m, 1, 1, 2, 2, 1, 1),
            (1, 23, 5, m, 16, 3, 3, m, 1, 1, 2, 2, 1, 1),
            (1, 13, 17, 8, 24, 3, 3, 8, 0, 0, 2, 2, 1, 1),
            (1, 23, 21, 8, 24, 3, 3, 8, 1, 1, 3, 3, 1, 1),
            (1, 20, 24, 8, 40, 3, 3, 8, 3, 3, 3, 3, 1, 1),
            # Medium input size, filter size (1x1, 2x3, 3x3, 5x5), stride (1,1)/(2,2)
            (1, 15, 19, 160, 224, 1, 1, 160, 0, 0, 1, 1, 1, 1),
            (1, 19, 37, 160, 224, 3, 3, 160, 1, 1, 2, 2, 1, 1),
            (1, 16, 16, 160, 224, 2, 3, 160, 1, 1, 1, 1, 1, 1),
            (1, 23, 21, 128, 224, 3, 3, 128, 1, 1, 1, 1, 1, 1),
            (1, 29, 37, 160, 224, 5, 5, 160, 2, 2, 1, 1, 1, 1),
            # C > CTA::K and non-multiples of CTA::K. Typical CTA::K = {32, 64}
            (1, 15, 19, 32 + m, 96, 3, 3, 32 + m, 1, 1, 1, 1, 1, 1),
            (1, 16, 24, 64 + m, 96, 3, 3, 64 + m, 1, 1, 1, 1, 1, 1),
            # Medium input size, filter size (1x1, 5x5, 7x7), stride (2, 2)
            (1, 13, 16, 288, 160, 5, 5, 288, 2, 2, 2, 2, 1, 1),
            (1, 55, 51, 256, 512, 1, 1, 256, 0, 0, 2, 2, 1, 1),
            (1, 71, 80, 32, 64, 5, 5, 32, 2, 2, 2, 2, 1, 1),
            (1, 224, 224, 8, 64, 7, 7, 8, 3, 3, 2, 2, 1, 1),
            # Medium input size, stride (3, 3), filter (3, 3), non-default padding
            (1, 27, 23, 256, 512, 3, 3, 256, 0, 0, 3, 3, 1, 1),
            # Medium input size, padding > stride, asymmetric filter, padding and striding
            (1, 27, 31, 256, 512, 3, 3, 256, 5, 7, 3, 4, 1, 1),
            (1, 27, 35, 256, 512, 7, 5, 256, 11, 7, 3, 5, 1, 1),
            # Medium input size, *mixed* strides (1, 2) and (2, 1), filter (3, 3), default padding
            (1, 27, 27, 256, 512, 3, 3, 256, 1, 1, 1, 2, 1, 1),
            (1, 27, 27, 256, 512, 3, 3, 256, 1, 1, 2, 1, 1, 1),
            # Additional input sizes
            (3, 28, 28, 256, 256, 2, 2, 256, 0, 0, 2, 2, 1, 1),
            (1, 32, 32, 16, 32, 3, 3, 16, 1, 1, 6, 2, 1, 1),
            (32, 24, 32, 32, 32, 1, 2, 32, 0, 0, 1, 1, 1, 1),
            (4, 2, 3, 256, 328, 3, 5, 256, 1, 1, 1, 1, 1, 1),
        ]
        return [Conv2DProblemSize(*spec) for spec in specs]

    # Add a few large and rigorous convolution problem sizes
    def initialize_conv2d_rigorous_sizes(self, minimum_channel_size):
        """Large, rigorous problem sizes — disabled, matching the C++ port."""
        sizes = []
        if False:  # NOTE(review): intentionally disabled in the ported testbed
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 124, 224, 2 * minimum_channel_size),
                (24, 7, 7, 2 * minimum_channel_size),
            ))
            sizes.append(Conv2DProblemSize.from_sizes(
                (1, 233, 35, minimum_channel_size),
                (24, 7, 5, minimum_channel_size),
            ))
        return sizes

    # Add resnet50 layers to unit testing sizes
    def initialize_conv2d_resnet50_sizes(self, batch_size):
        """ResNet-50 layer shapes for the given batch size."""
        # Each spec is (H, W, C, K, R, S, pad_h, pad_w, stride_h, stride_w);
        # filter channel count always equals the input channel count C.
        specs = [
            (56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
            (56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
            (56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
            (56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
            (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
            (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
            (28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
            (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
            (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
            (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
            (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
            (14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
            (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
            (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
            (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
            (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
            (7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
            (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
            (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
        ]
        return [
            Conv2DProblemSize(
                batch_size, h, w, c,
                k, r, s, c,
                pad_h, pad_w,
                stride_h, stride_w,
                1, 1,
            )
            for (h, w, c, k, r, s, pad_h, pad_w, stride_h, stride_w) in specs
        ]

    def initialize_conv2d_grouped_sizes(self):
        """Grouped-convolution problem sizes."""
        threadblock_n = 128
        threadblock_k = 32

        sizes = []
        ##########################################
        # One group calculated by one or multiple CTAs: k_per_group % CTA::N = 0
        # One CTA calculates a single group
        ##########################################
        for cta_per_group_k in range(1, 4):
            for groups in range(2, 5):
                conv_k = cta_per_group_k * threadblock_n * groups
                sizes.append(Conv2DProblemSize(
                    1, 8, 8, threadblock_k * 2 * groups,
                    conv_k, 3, 3, threadblock_k * 2,
                    1, 1,
                    1, 1,
                    1, 1,
                    ConvMode.CrossCorrelation,
                    1,
                    groups,
                ))

        # Remaining fixed grouped cases: (H, W, C, K, filter_C, stride, groups).
        # All use a 3x3 filter, unit padding/dilation, square stride.
        fixed_specs = [
            # Partial gemm_k: k_per_group == CTA::N && channels_per_group < CTA::K
            (8, 8, threadblock_k, threadblock_n * 2, threadblock_k // 2, 1, 2),
            (56, 56, 696, 768, 232, 2, 3),
            (14, 14, 1392, 1536, 232, 1, 3),
            # One CTA calculates multiple groups: CTA::N % k_per_group = 0
            # 2 groups per CTA
            (8, 8, threadblock_k * 4, threadblock_n, threadblock_k * 2, 1, 2),
            # 2 groups per CTA and partial gemm_k
            (8, 8, threadblock_k, threadblock_n, threadblock_k // 2, 1, 2),
            # 4 groups per CTA
            (8, 8, threadblock_k * 8, threadblock_n // 2, threadblock_k * 2, 1, 4),
            # 4 groups per CTA and partial gemm_k
            (8, 8, threadblock_k * 2, threadblock_n // 2, threadblock_k // 2, 1, 4),
        ]
        for h, w, c, k, filter_c, stride, groups in fixed_specs:
            sizes.append(Conv2DProblemSize(
                1, h, w, c,
                k, 3, 3, filter_c,
                1, 1,
                stride, stride,
                1, 1,
                ConvMode.CrossCorrelation,
                1,
                groups,
            ))

        return sizes
|
||||
@ -31,56 +31,64 @@
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Low-level functionality tests for Conv2d operands on SM80
|
||||
Low-level functionality tests for Conv2d operations on SM80
|
||||
"""
|
||||
from conv2d_test_utils import *
|
||||
import cutlass
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from conv2d_test_utils import *
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
|
||||
@unittest.skipIf(device_cc() != cc, 'Device compute capability is invalid for SM80 tests.')
|
||||
|
||||
@unittest.skipIf(device_cc() < cc, 'Device compute capability is invalid for SM80 tests.')
|
||||
class Conv2dSm80(unittest.TestCase):
|
||||
"""
|
||||
Wrapper class to which tests will be added dynamically in __main__
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
conv_problems = get_conv_problems()
|
||||
|
||||
|
||||
# Tests for optimized & analytic
|
||||
for conv_kind in ["fprop", "wgrad", "dgrad"]:
|
||||
# F16, simt
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="simt", threadblock_shape=[128, 128, 8],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="simt", threadblock_shape=[128, 128, 8],
|
||||
warp_count=[4, 2, 1], stages=2, instruction_shape=[1, 1, 1])
|
||||
# F16, tensor op
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
|
||||
# F16, tensor op, analytic iterator
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f16, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="analytic")
|
||||
# F16, tensor op, f32 output
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f32,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16])
|
||||
# F16, tensor op, different tile description
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 64, 32],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 64, 32],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8])
|
||||
# F32, simt
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
|
||||
opclass="simt", threadblock_shape=[128, 128, 8],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f32, cutlass.DataType.f32, cutlass.DataType.f32,
|
||||
opclass="simt", threadblock_shape=[128, 128, 8],
|
||||
warp_count=[4, 2, 1], stages=4, instruction_shape=[1, 1, 1])
|
||||
# Tf32, tensorop
|
||||
add_test(
|
||||
@ -90,19 +98,19 @@ for conv_kind in ["fprop", "wgrad", "dgrad"]:
|
||||
)
|
||||
# Split-K
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="serial",
|
||||
split_k_slices=2)
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode="parallel",
|
||||
split_k_slices=5)
|
||||
# Swizzling functor
|
||||
add_test(
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 64, 32],
|
||||
Conv2dSm80, cc, conv_kind, conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 64, 32],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 8], swizzle=4)
|
||||
|
||||
# Tests for few channels and fixed channels
|
||||
@ -113,14 +121,14 @@ for c, tb, stage, inst in zip([2, 1],
|
||||
[[16, 8, 16], [16, 8, 8]]):
|
||||
add_test(
|
||||
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=tb,
|
||||
opclass="tensor_op", threadblock_shape=tb,
|
||||
warp_count=[2, 2, 1], stages=stage, instruction_shape=inst, iterator_algorithm="few_channels"
|
||||
)
|
||||
# F16, tensor op, fixed channels
|
||||
for c in [8, 4, 2]:
|
||||
add_test(
|
||||
Conv2dSm80, cc, "fprop", conv2d_few_channel_problemsizes(c), cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], iterator_algorithm="fixed_channels"
|
||||
)
|
||||
|
||||
@ -128,11 +136,11 @@ for c in [8, 4, 2]:
|
||||
for activation in ["relu", "leaky_relu"]:
|
||||
for split_k_mode, split_k_slices in zip(["parallel", "serial", "parallel"], [1, 7, 5]):
|
||||
add_test(
|
||||
Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
Conv2dSm80, cc, "fprop", conv_problems, cutlass.DataType.f16, cutlass.DataType.f32, cutlass.DataType.f16,
|
||||
opclass="tensor_op", threadblock_shape=[128, 128, 64],
|
||||
warp_count=[2, 2, 1], stages=3, instruction_shape=[16, 8, 16], split_k_mode=split_k_mode,
|
||||
split_k_slices=split_k_slices, activation=activation)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
425
test/python/cutlass/conv2d/conv2d_test_utils.py
Normal file
425
test/python/cutlass/conv2d/conv2d_test_utils.py
Normal file
@ -0,0 +1,425 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Utility functions for Conv2d tests.
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
import cutlass
|
||||
from cutlass import (
|
||||
ConvKind,
|
||||
ConvMode,
|
||||
DataType,
|
||||
DataTypeNames,
|
||||
EpilogueScheduleSuffixes,
|
||||
KernelScheduleSuffixes,
|
||||
LayoutType,
|
||||
OpcodeClassNames,
|
||||
ShortDataTypeNames,
|
||||
ShortLayoutTypeNames,
|
||||
SplitKMode,
|
||||
)
|
||||
from cutlass.backend.utils.software import SubstituteTemplate
|
||||
from cutlass.shape import Conv2DProblemSize
|
||||
from cutlass.utils.datatypes import numpy_type, torch_type
|
||||
|
||||
from conv2d_problem_sizes import TestbedConv2dProblemSizes
|
||||
|
||||
|
||||
def get_name_conv2d(
|
||||
arch,
|
||||
conv_kind,
|
||||
element,
|
||||
element_accumulator,
|
||||
element_output,
|
||||
opclass,
|
||||
threadblock_shape,
|
||||
warp_count,
|
||||
instruction_shape,
|
||||
stages,
|
||||
iterator_algorithm,
|
||||
swizzle,
|
||||
split_k_mode,
|
||||
split_k_slices,
|
||||
activation
|
||||
):
|
||||
"""
|
||||
Generates a procedural name for a test case for conv2d
|
||||
|
||||
:param arch: compute capability of kernel being generated
|
||||
:type arch: int
|
||||
:param conv_kind: the convolution type (i.e. fprop, dgrad, wgrad)
|
||||
:type conv_kind: str
|
||||
:param iterator_algorithm: the iterator algorithm applied
|
||||
:type iterator_algorithm: cutlass_library.library.IteratorAlgorithm
|
||||
:param element_a: data type of operand A
|
||||
:param element_b: data type of operand B
|
||||
:param element_c: data type of operand C
|
||||
:param element_accumulator: data type used in accumulation
|
||||
:param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
|
||||
:type opclass: cutlass.OpcodeClass
|
||||
:param threadblock_shape: indexable container of dimensions of threadblock tiles
|
||||
:param stages: number of pipeline stages to use in the kernel
|
||||
:type stages: int
|
||||
:param stride_support: stride support of dgrad
|
||||
:param alignment: int
|
||||
:type alignment: int
|
||||
|
||||
:return: str
|
||||
"""
|
||||
if iterator_algorithm is None:
|
||||
iterator_algorithm = "AUTO"
|
||||
if swizzle is None:
|
||||
swizzle = 1
|
||||
name_format = "test_SM${arch}_Device_Conv2d_${conv_kind}_${iter_alg}_ImplicitGemm_${eA}nhwc_${eB}nhwc_${eC}nhwc_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${wM}x${wN}x${wK}_${IM}${IN}${IK}_stage${stages}_swizzle${swizzle}_${split_k_mode}${split_k_slices}_${activation}"
|
||||
|
||||
return SubstituteTemplate(
|
||||
name_format,
|
||||
{
|
||||
"arch": str(arch),
|
||||
"conv_kind": conv_kind,
|
||||
"iter_alg": iterator_algorithm,
|
||||
"eA": DataTypeNames[element],
|
||||
"eB": DataTypeNames[element],
|
||||
"eC": DataTypeNames[element_output],
|
||||
"opclass": opclass,
|
||||
"acc": DataTypeNames[element_accumulator],
|
||||
"tbM": str(threadblock_shape[0]),
|
||||
"tbN": str(threadblock_shape[1]),
|
||||
"tbK": str(threadblock_shape[2]),
|
||||
"wM": str(threadblock_shape[0] // warp_count[0]),
|
||||
"wN": str(threadblock_shape[1] // warp_count[1]),
|
||||
"wK": str(threadblock_shape[2] // warp_count[2]),
|
||||
"IM": str(instruction_shape[0]),
|
||||
"IN": str(instruction_shape[1]),
|
||||
"IK": str(instruction_shape[2]),
|
||||
"stages": str(stages),
|
||||
"swizzle": str(swizzle),
|
||||
"split_k_mode": split_k_mode,
|
||||
"split_k_slices": str(split_k_slices),
|
||||
"activation": activation
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def conv2d_few_channel_problemsizes(channels):
|
||||
problem_sizes = [
|
||||
Conv2DProblemSize(
|
||||
1, 8, 8, channels,
|
||||
16, 3, 3, channels,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 16, 16, channels,
|
||||
16, 3, 3, channels,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 16, 16, channels,
|
||||
16, 7, 7, channels,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 224, 224, channels,
|
||||
32, 7, 7, channels,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 224, 224, channels,
|
||||
64, 7, 7, channels,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 224, 224, channels,
|
||||
64, 5, 5, channels,
|
||||
1, 1,
|
||||
1, 1,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 224, 224, channels,
|
||||
64, 5, 5, channels,
|
||||
1, 1,
|
||||
2, 2,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
return problem_sizes
|
||||
|
||||
|
||||
def validate_problem_size(ps, conv_kind, split_k_slices):
|
||||
P = (ps.H + 2 * ps.pad_h - ps.dilation_h * (ps.R - 1) - 1) // ps.stride_h + 1
|
||||
Q = (ps.W + 2 * ps.pad_w - ps.dilation_w * (ps.S - 1) - 1) // ps.stride_w + 1
|
||||
if P != ps.P or Q != ps.Q:
|
||||
return False
|
||||
|
||||
# Split-K (serial or parallel) is not supported for strided dgrad
|
||||
if conv_kind == "dgrad" and split_k_slices > 1 and (ps.stride_h > 1 or ps.stride_w > 1):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
class Conv2dLauncherFrontend:
|
||||
def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend="numpy"):
|
||||
self.operation = plan
|
||||
self.conv_kind = plan.conv_kind
|
||||
self.seed = seed
|
||||
self.backend = backend
|
||||
|
||||
self.dtype_A = plan._element_a
|
||||
self.dtype_B = plan._element_b
|
||||
self.dtype_C = plan._element_c
|
||||
self.dtype_acc = plan._element_accumulator
|
||||
self.layout_A = LayoutType.TensorNHWC
|
||||
self.layout_B = LayoutType.TensorNHWC
|
||||
self.layout_C = LayoutType.TensorNHWC
|
||||
self.layout_D = LayoutType.TensorNHWC
|
||||
|
||||
self.element_compute = DataType.f32
|
||||
|
||||
if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
|
||||
self.rand_max = 1
|
||||
else:
|
||||
self.rand_max = 4
|
||||
self.activation = plan.activation
|
||||
|
||||
def uniform_init(self, size, dtype):
|
||||
tensor = torch.ceil(
|
||||
torch.empty(size=size, dtype=torch_type(dtype), device="cuda").uniform_(-self.rand_max - 0.5, self.rand_max - 0.5)
|
||||
).to(memory_format=torch.channels_last)
|
||||
return tensor
|
||||
|
||||
def reference(self, ps, A, B, C, alpha, beta, activation):
|
||||
if self.conv_kind == ConvKind.Fprop:
|
||||
torch_result = alpha * torch.ops.aten.conv2d(
|
||||
A,
|
||||
B,
|
||||
stride=(ps.stride_h, ps.stride_w),
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
dilation=(ps.dilation_h, ps.dilation_w)
|
||||
) + beta * C
|
||||
elif self.conv_kind == ConvKind.Dgrad:
|
||||
torch_result = alpha * torch.nn.grad.conv2d_input(
|
||||
(ps.N, ps.C, ps.H, ps.W),
|
||||
B,
|
||||
A,
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
stride=(ps.stride_h, ps.stride_w)
|
||||
) + beta * C
|
||||
elif self.conv_kind == ConvKind.Wgrad:
|
||||
torch_result = alpha * torch.nn.grad.conv2d_weight(
|
||||
B,
|
||||
(ps.K, ps.C, ps.R, ps.S),
|
||||
A,
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
stride=(ps.stride_h, ps.stride_w)
|
||||
) + beta * C
|
||||
else:
|
||||
raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")
|
||||
|
||||
if activation == cutlass.backend.epilogue.relu:
|
||||
torch_result = torch.nn.functional.relu(torch_result)
|
||||
elif activation == cutlass.backend.epilogue.leaky_relu:
|
||||
torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)
|
||||
return torch_result
|
||||
|
||||
def run(self, ps, split_k_mode=SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
|
||||
if self.conv_kind == ConvKind.Fprop:
|
||||
tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
elif self.conv_kind == ConvKind.Dgrad:
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
elif self.conv_kind == ConvKind.Wgrad:
|
||||
tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
|
||||
tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
|
||||
tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
|
||||
else:
|
||||
raise Exception(f"Conv kind {self.conv_kind} is not supported")
|
||||
|
||||
torch.manual_seed(self.seed)
|
||||
|
||||
tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
|
||||
tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
|
||||
tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
|
||||
tensor_D = torch.zeros_like(tensor_C).to(memory_format=torch.channels_last)
|
||||
self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
|
||||
stride=(ps.stride_h, ps.stride_w),
|
||||
padding=(ps.pad_h, ps.pad_w),
|
||||
dilation=(ps.dilation_h, ps.dilation_w),
|
||||
alpha=alpha, beta=beta,
|
||||
split_k=(split_k_mode, split_k_slices))
|
||||
|
||||
tensor_D_ref = self.reference(ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation)
|
||||
|
||||
torch.cuda.synchronize()
|
||||
passed = torch.equal(tensor_D, tensor_D_ref)
|
||||
|
||||
return passed
|
||||
|
||||
|
||||
def add_test(
|
||||
cls,
|
||||
cc,
|
||||
conv_kind,
|
||||
problem_sizes,
|
||||
element,
|
||||
element_accumulator,
|
||||
element_output,
|
||||
opclass,
|
||||
threadblock_shape,
|
||||
warp_count,
|
||||
instruction_shape,
|
||||
stages,
|
||||
iterator_algorithm=None,
|
||||
swizzle=None,
|
||||
split_k_mode="serial",
|
||||
split_k_slices=1,
|
||||
activation = "identity"
|
||||
):
|
||||
"""Create a test-running function with the given specification"""
|
||||
test_name = get_name_conv2d(
|
||||
cc, conv_kind, element, element_accumulator,
|
||||
element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
|
||||
iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)
|
||||
|
||||
def run(self):
|
||||
# Create the plan
|
||||
plan = cutlass.Conv2d(
|
||||
kind=conv_kind,
|
||||
element=element,
|
||||
element_accumulator=element_accumulator,
|
||||
element_C=element_output,
|
||||
element_D=element_output
|
||||
)
|
||||
|
||||
# Set the opclass
|
||||
plan.opclass = opclass
|
||||
# Set the tile description
|
||||
td = {
|
||||
"threadblock_shape": threadblock_shape,
|
||||
"warp_count": warp_count,
|
||||
"stages": stages,
|
||||
"instruction_shape": instruction_shape,
|
||||
}
|
||||
|
||||
plan.tile_description = td
|
||||
# Set iterator algorithm
|
||||
if iterator_algorithm is not None:
|
||||
plan.iterator_algorithm = iterator_algorithm
|
||||
# Set swizzling functor
|
||||
if swizzle is not None:
|
||||
plan.swizzling_stride = swizzle
|
||||
|
||||
if activation != "identity":
|
||||
if activation == "leaky_relu":
|
||||
plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
|
||||
else:
|
||||
plan.activation = getattr(cutlass.epilogue, activation)
|
||||
|
||||
conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="torch")
|
||||
|
||||
for ps in problem_sizes:
|
||||
if not validate_problem_size(ps, conv_kind, split_k_slices): continue
|
||||
|
||||
self.assertTrue(conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 2.0))
|
||||
|
||||
setattr(cls, test_name, run)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
def get_conv_problems():
|
||||
# 64: minimum channel size
|
||||
conv_problems = TestbedConv2dProblemSizes(64).all
|
||||
|
||||
# Insert alignment 4 & 2 tests
|
||||
conv_problems += [
|
||||
Conv2DProblemSize(
|
||||
1, 4, 4, 12,
|
||||
8, 3, 3, 12,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 4, 4, 14,
|
||||
8, 3, 3, 14,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
Conv2DProblemSize(
|
||||
1, 23, 56, 98,
|
||||
128, 3, 3, 98,
|
||||
4, 5,
|
||||
3, 3,
|
||||
1, 1,
|
||||
ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
),
|
||||
]
|
||||
|
||||
return conv_problems
|
||||
@ -1,6 +1,6 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
@ -30,13 +30,15 @@
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
import pathlib
|
||||
import unittest
|
||||
from cutlass.backend.memory_manager import *
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
cutlass.backend.get_memory_pool(2**32, 2**32)
|
||||
loader = unittest.TestLoader()
|
||||
tests = loader.discover('./', 'conv2d_*.py')
|
||||
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
|
||||
tests = loader.discover(script_dir, 'conv2d_*.py')
|
||||
testRunner = unittest.runner.TextTestRunner()
|
||||
testRunner.run(tests)
|
||||
results = testRunner.run(tests)
|
||||
if not results.wasSuccessful():
|
||||
raise Exception('Test cases failed')
|
||||
@ -39,7 +39,6 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass_bindings
|
||||
|
||||
if cutlass.utils.datatypes.torch_available:
|
||||
import torch
|
||||
@ -94,7 +93,7 @@ def _generate_conv2d_problem(conv_kind, dtype, ps):
|
||||
:type conv_kind: str
|
||||
:param dtype: data type of tensors
|
||||
:param problem_size: the conv2d problem size
|
||||
:type problem_size: cutlass_bindings.conv.Conv2dProblemSize
|
||||
:type problem_size: cutlass.shape.Conv2DProblemSize
|
||||
|
||||
:return: initialized tensors A, B, C, and D
|
||||
:rtype: list
|
||||
@ -196,13 +195,11 @@ class PyTorchExtensionTest(unittest.TestCase):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
mod = cutlass.emit.pytorch(op, name="conv2d_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
|
||||
|
||||
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
|
||||
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
|
||||
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
|
||||
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass_bindings.MatrixCoord(3, 3),
|
||||
cutlass_bindings.MatrixCoord(1, 1),
|
||||
cutlass_bindings.conv.Mode.cross_correlation,
|
||||
problem_size = cutlass.shape.Conv2DProblemSize(
|
||||
1, 4, 4, 16,
|
||||
8, 3, 3, 16,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1
|
||||
)
|
||||
|
||||
@ -239,13 +236,13 @@ class PyTorchExtensionTest(unittest.TestCase):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
mod = cutlass.emit.pytorch(op, name="conv2d_dgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
|
||||
|
||||
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
|
||||
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
|
||||
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
|
||||
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass_bindings.MatrixCoord(3, 3),
|
||||
cutlass_bindings.MatrixCoord(1, 1),
|
||||
cutlass_bindings.conv.Mode.cross_correlation,
|
||||
problem_size = cutlass.shape.Conv2DProblemSize(
|
||||
1, 4, 4, 16,
|
||||
8, 3, 3, 16,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1,
|
||||
cutlass.ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
)
|
||||
|
||||
@ -273,13 +270,13 @@ class PyTorchExtensionTest(unittest.TestCase):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
mod = cutlass.emit.pytorch(op, name="conv2d_wgrad_mod", cc=plan.cc, sourcedir=tmpdir, jit=True)
|
||||
|
||||
problem_size = cutlass_bindings.conv.Conv2dProblemSize(
|
||||
cutlass_bindings.Tensor4DCoord(1, 4, 4, 16),
|
||||
cutlass_bindings.Tensor4DCoord(8, 3, 3, 16),
|
||||
cutlass_bindings.Tensor4DCoord(0, 0, 0, 0),
|
||||
cutlass_bindings.MatrixCoord(3, 3),
|
||||
cutlass_bindings.MatrixCoord(1, 1),
|
||||
cutlass_bindings.conv.Mode.cross_correlation,
|
||||
problem_size = cutlass.shape.Conv2DProblemSize(
|
||||
1, 4, 4, 16,
|
||||
8, 3, 3, 16,
|
||||
0, 0,
|
||||
3, 3,
|
||||
1, 1,
|
||||
cutlass.ConvMode.CrossCorrelation,
|
||||
1, 1
|
||||
)
|
||||
|
||||
100
test/python/cutlass/evt/evt_compute_sm80_90.py
Normal file
100
test/python/cutlass/evt/evt_compute_sm80_90.py
Normal file
@ -0,0 +1,100 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
"""
|
||||
Unit test for compute node in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
from cutlass import swizzle
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTComputeSM90(EVTTestCaseBase):
|
||||
|
||||
def test_arith(self):
|
||||
"""
|
||||
Test Arithmatic op
|
||||
"""
|
||||
def evt_arith_compute(accum, C, alpha, beta, gamma):
|
||||
D = ((accum + C) * alpha - gamma) / beta
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.5,
|
||||
"beta": 0.5,
|
||||
"gamma": 2.5,
|
||||
"D": self.fake_tensor(self.element, (l, m, n))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_arith_compute, example_inputs)
|
||||
input_keys = ["C", "alpha", "beta", "gamma"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_func_call(self):
|
||||
"""
|
||||
Test Function call
|
||||
"""
|
||||
def evt_func_call(accum, C, alpha, beta, gamma):
|
||||
D = multiply_add(relu(accum + alpha) + C, beta, gamma)
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.5,
|
||||
"beta": 0.5,
|
||||
"gamma": 2.5,
|
||||
"D": self.fake_tensor(self.element, (l, m, n))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_func_call, example_inputs)
|
||||
input_keys = ["C", "alpha", "beta", "gamma"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
173
test/python/cutlass/evt/evt_layout_sm80_90.py
Normal file
173
test/python/cutlass/evt/evt_layout_sm80_90.py
Normal file
@ -0,0 +1,173 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for store nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTLayoutSM90(EVTTestCaseBase):
|
||||
|
||||
def test_permute_1(self):
|
||||
"""
|
||||
Returning a tensor with shape [m, n]
|
||||
"""
|
||||
def evt_permute(accum, alpha, C):
|
||||
F = alpha * accum
|
||||
F_permute = permute(F, indices=(0, 2, 1))
|
||||
D_permute = F_permute + permute(C, indices=(0, 2, 1))
|
||||
D = permute(D_permute, indices=(0, 2, 1))
|
||||
return D, F
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_permute, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
|
||||
def test_permute_2(self):
|
||||
"""
|
||||
Returning a tensor with shape [m, n]
|
||||
"""
|
||||
def evt_permute(accum, alpha, C):
|
||||
F = alpha * accum
|
||||
F_permute = permute(F, indices=(0, 2, 1))
|
||||
D = F_permute + C
|
||||
return D, F
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"C": self.fake_tensor(self.element, (l, n, m)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, n, m)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_permute, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() == 80, "This unittest is for cc = Sm90 only")
|
||||
def test_permute_3(self):
|
||||
"""
|
||||
Returning a tensor with shape [m, n]
|
||||
"""
|
||||
def evt_permute(accum, alpha, C):
|
||||
F = alpha * accum
|
||||
F_permute = permute(F, indices=(1, 0, 2))
|
||||
D = F_permute + C
|
||||
return D, F
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"C": self.fake_tensor(self.element, (m, l, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (m, l, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_permute, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_reshape(self):
|
||||
"""
|
||||
Test reshape
|
||||
"""
|
||||
def evt_reshape(accum, alpha, TensorE):
|
||||
F = alpha * accum
|
||||
E_reshape = reshape(TensorE, new_shape=(512, 1))
|
||||
D = F + E_reshape
|
||||
return D
|
||||
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
|
||||
"alpha": 0.5,
|
||||
"TensorE": self.fake_tensor(self.element, (16, 32)),
|
||||
"D": self.fake_tensor(self.element, (self.l, self.m, self.n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
|
||||
input_keys = ["alpha", "TensorE"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify(self.problem_size, input_keys, result_keys, self.l)
|
||||
|
||||
def test_reshape2(self):
|
||||
"""
|
||||
Test reshape
|
||||
"""
|
||||
def evt_reshape(accum, alpha, TensorE):
|
||||
F = alpha * accum
|
||||
F_reshape = reshape(F, new_shape=(2, 3, 512, 256))
|
||||
D = F_reshape + TensorE
|
||||
return D
|
||||
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (self.l, self.m, self.n)),
|
||||
"alpha": 0.5,
|
||||
"TensorE": self.fake_tensor(self.element, (2, 3, 1, self.n)),
|
||||
"D": self.fake_tensor(self.element, (2, 3, self.m, self.n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_reshape, example_inputs)
|
||||
input_keys = ["alpha", "TensorE"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify(self.problem_size, input_keys, result_keys, self.l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
142
test/python/cutlass/evt/evt_load_sm80_90.py
Normal file
142
test/python/cutlass/evt/evt_load_sm80_90.py
Normal file
@ -0,0 +1,142 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for load nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTLoadSM90(EVTTestCaseBase):
|
||||
|
||||
def test_tensor_load(self):
|
||||
"""
|
||||
Load extra tensor with shape [m, n]
|
||||
"""
|
||||
def evt_tensor_load(accum, C, aux, aux_batch):
|
||||
D = accum + C + aux + aux_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"aux_batch": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_tensor_load, example_inputs)
|
||||
input_keys = ["C", "aux", "aux_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_row_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [1, n]
|
||||
"""
|
||||
def evt_row_broadcast(accum, C, bias, bias_batch):
|
||||
D = accum + C + bias + bias_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"bias": self.fake_tensor(self.element, (n,)),
|
||||
"bias_batch": self.fake_tensor(np.float32, (l, 1, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_row_broadcast, example_inputs)
|
||||
input_keys = ["C", "bias", "bias_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_column_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [m, 1]
|
||||
"""
|
||||
def evt_column_broadcast(accum, C, bias, bias_batch):
|
||||
D = accum + C + bias + bias_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"bias": self.fake_tensor(self.element, (m, 1)),
|
||||
"bias_batch": self.fake_tensor(np.float32, (l, m, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_column_broadcast, example_inputs)
|
||||
input_keys = ["C", "bias", "bias_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_scalar_broadcast(self):
|
||||
"""
|
||||
Load extra tensor with shape [1, 1]
|
||||
"""
|
||||
def evt_scalar_broadcast(accum, C, alpha, alpha_batch):
|
||||
D = accum + C + alpha + alpha_batch
|
||||
return D
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"alpha_batch": self.fake_tensor(np.float32, (l, 1, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_scalar_broadcast, example_inputs)
|
||||
input_keys = ["C", "alpha", "alpha_batch"]
|
||||
result_keys = ["D"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
274
test/python/cutlass/evt/evt_mixed_sm80_90.py
Normal file
274
test/python/cutlass/evt/evt_mixed_sm80_90.py
Normal file
@ -0,0 +1,274 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unittest for mixed types of nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
from cutlass.swizzle import ThreadblockSwizzleStreamK
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTMixedSM90(EVTTestCaseBase):
|
||||
def test_mixed_dag(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
if device_cc() == 80:
|
||||
aligments = [2, 4, 8]
|
||||
else:
|
||||
# Sm90 EVT currently only supports 128-bit alignment
|
||||
aligments = [8,]
|
||||
for align in aligments:
|
||||
for m, n, k, l in self.get_problem_sizes(align):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_float(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for align in [3, 2, 4]:
|
||||
for m, n, k, l in self.get_problem_sizes(align):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"cbias": self.fake_tensor(np.float32, (m, 1)),
|
||||
"rbias": self.fake_tensor(np.float32, (n,)),
|
||||
"D": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"F": self.fake_tensor(np.float32, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(np.float32, (n,)),
|
||||
"E_col_max": self.fake_tensor(np.float32, (m, 1))
|
||||
}
|
||||
launcher = EVTTestBed(DataType.f32, evt_mixed_dag, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_stage2(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, epilogue_stages=2)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_partition_k(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
tile_description = {
|
||||
"threadblock_shape": [128, 128, 64],
|
||||
"warp_count": [2, 2, 2]
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag, example_inputs, tile_description=tile_description, epilogue_stages=2)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
@unittest.skipIf(device_cc() != 80, "This unittest is for cc = Sm80 only")
|
||||
def test_mixed_dag_stream_k(self):
|
||||
def evt_mixed_dag(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
# High per-sm occupancy tile_description
|
||||
tile_description = {
|
||||
"threadblock_shape": [128, 128, 32],
|
||||
"warp_count": [2, 2, 1],
|
||||
"stages": 3
|
||||
}
|
||||
tds = [None, tile_description]
|
||||
for td in tds:
|
||||
for m, n, k, l in self.get_problem_sizes(8, k=960, batch_count=[1, 3]):
|
||||
if l == 1:
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (m, n)),
|
||||
"F": self.fake_tensor(self.element, (m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
else:
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (l, m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
if td is not None:
|
||||
launcher = EVTTestBed(
|
||||
self.element, evt_mixed_dag, example_inputs,
|
||||
tile_description=td,
|
||||
swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
|
||||
else:
|
||||
launcher = EVTTestBed(
|
||||
self.element, evt_mixed_dag, example_inputs,
|
||||
swizzling_functor=ThreadblockSwizzleStreamK, backend="torch")
|
||||
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_mixed_dag_no_batch(self):
|
||||
def evt_mixed_dag_no_batch(accum, alpha, C, beta, aux, cbias, rbias):
|
||||
F = alpha * accum + (beta * C + aux)
|
||||
F_row_max = max(F, dim=[0, 1])
|
||||
E = relu(F + 1) + cbias + rbias
|
||||
E_col_max = max(E, dim=[0, 2])
|
||||
D = E + F
|
||||
return D, F, F_row_max, E_col_max
|
||||
|
||||
for m, n, k, _ in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (m, n)),
|
||||
"alpha": 1.0,
|
||||
"C": self.fake_tensor(self.element, (m, n)),
|
||||
"beta": 1.0,
|
||||
"aux": self.fake_tensor(self.element, (m, n)),
|
||||
"cbias": self.fake_tensor(self.element, (m, 1)),
|
||||
"rbias": self.fake_tensor(self.element, (n,)),
|
||||
"D": self.fake_tensor(self.element, (m, n)),
|
||||
"F": self.fake_tensor(self.element, (m, n)),
|
||||
"F_row_max": self.fake_tensor(DataType.f32, (n,)),
|
||||
"E_col_max": self.fake_tensor(DataType.f32, (m, 1))
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_mixed_dag_no_batch, example_inputs)
|
||||
input_keys = ["alpha", "C", "beta", "aux", "cbias", "rbias"]
|
||||
result_keys = ["D", "F", "F_row_max", "E_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, 1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
155
test/python/cutlass/evt/evt_store_sm80_90.py
Normal file
155
test/python/cutlass/evt/evt_store_sm80_90.py
Normal file
@ -0,0 +1,155 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Unit test for store nodes in SM90
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass.backend import *
|
||||
from cutlass.epilogue import *
|
||||
|
||||
from utils.evt_testbed import EVTTestBed, EVTTestCaseBase
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
|
||||
class TestEVTStoreSM90(EVTTestCaseBase):
|
||||
|
||||
def test_aux_store(self):
|
||||
"""
|
||||
Returning a tensor with shape [m, n]
|
||||
"""
|
||||
def evt_aux_store(accum, alpha, C):
|
||||
F = alpha * accum
|
||||
D = F + C
|
||||
return D, F
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 0.5,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F": self.fake_tensor(self.element, (l, m, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_aux_store, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_col_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [m, 1]
|
||||
"""
|
||||
def evt_row_reduce(accum, alpha, C):
|
||||
acc_row_max = max(accum, dim=[2,])
|
||||
F = alpha * accum
|
||||
F_row_max = max(F, dim=[0, 2])
|
||||
D = F + C
|
||||
return D, F_row_max, acc_row_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_row_max": self.fake_tensor(np.float32, (m, 1)),
|
||||
"acc_row_max": self.fake_tensor(np.float32, (l, m, 1)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_row_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_row_max", "acc_row_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_row_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [n]
|
||||
"""
|
||||
def evt_col_reduce(accum, alpha, C):
|
||||
acc_col_max = max(accum, dim=[1,])
|
||||
F = alpha * accum
|
||||
F_col_max = max(F, dim=[0, 1])
|
||||
D = F + C
|
||||
return D, F_col_max, acc_col_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"F_col_max": self.fake_tensor(np.float32, (n,)),
|
||||
"acc_col_max": self.fake_tensor(np.float32, (l, 1, n)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_col_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_col_max", "acc_col_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
def test_scalar_reduce(self):
|
||||
"""
|
||||
Reduction [m, n] -> [1,]
|
||||
"""
|
||||
def evt_scalar_reduce(accum, alpha, C):
|
||||
acc_max = max(accum, dim=[1, 2])
|
||||
F = alpha * accum
|
||||
F_max = max(F, dim=[0, 1, 2])
|
||||
D = F + C
|
||||
return D, F_max, acc_max
|
||||
|
||||
for m, n, k, l in self.get_problem_sizes(8):
|
||||
example_inputs = {
|
||||
"accum": self.fake_tensor(self.element, (l, m, n)),
|
||||
"alpha": 2.0,
|
||||
"C": self.fake_tensor(self.element, (l, m, n)),
|
||||
"acc_max": self.fake_tensor(np.float32, (l, 1, 1)),
|
||||
"F_max": self.fake_tensor(np.float32, (1,)),
|
||||
"D": self.fake_tensor(self.element, (l, m, n)),
|
||||
}
|
||||
|
||||
launcher = EVTTestBed(self.element, evt_scalar_reduce, example_inputs)
|
||||
input_keys = ["C", "alpha"]
|
||||
result_keys = ["D", "F_max", "acc_max"]
|
||||
launcher.verify((m, n, k), input_keys, result_keys, l)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@ -30,12 +30,14 @@
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
loader = unittest.TestLoader()
|
||||
tests = loader.discover('./', 'gemm_*.py')
|
||||
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
|
||||
tests = loader.discover(script_dir, 'evt_*.py')
|
||||
testRunner = unittest.runner.TextTestRunner()
|
||||
results = testRunner.run(tests)
|
||||
if not results.wasSuccessful():
|
||||
230
test/python/cutlass/evt/utils/evt_testbed.py
Normal file
230
test/python/cutlass/evt/utils/evt_testbed.py
Normal file
@ -0,0 +1,230 @@
|
||||
################################################################################
|
||||
#
|
||||
# Copyright (c) 20123 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
"""
|
||||
Testbed classes of EVT
|
||||
"""
|
||||
|
||||
import torch
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass import Tensor
|
||||
import cutlass.backend.evt
|
||||
from cutlass.profiler import CUDAEventProfiler
|
||||
from cutlass.shape import GemmCoord
|
||||
from cutlass.utils.datatypes import torch_type
|
||||
|
||||
|
||||
class EVTReferenceModule:
|
||||
def __init__(self, layout_A, layout_B, layout_C, epilogue_visitor):
|
||||
self.layout_A = layout_A
|
||||
self.layout_B = layout_B
|
||||
self.layout_C = layout_C
|
||||
self.epilogue_visitor = epilogue_visitor
|
||||
|
||||
def run(self, A, B, C, problem_size, alpha, beta, batch=1):
|
||||
if self.layout_A == cutlass.LayoutType.RowMajor:
|
||||
A_row = A.view((batch, problem_size.m, problem_size.k))
|
||||
else:
|
||||
A_col = A.view((batch, problem_size.k, problem_size.m))
|
||||
A_row = torch.permute(A_col, (0, 2, 1))
|
||||
|
||||
if self.layout_B == cutlass.LayoutType.RowMajor:
|
||||
B_row = B.view((batch, problem_size.k, problem_size.n))
|
||||
else:
|
||||
B_col = B.view((batch, problem_size.n, problem_size.k))
|
||||
B_row = torch.permute(B_col, (0, 2, 1))
|
||||
|
||||
if self.layout_C == cutlass.LayoutType.RowMajor:
|
||||
C_row = C.view((batch, problem_size.m, problem_size.n))
|
||||
else:
|
||||
C_col = C.view((batch, problem_size.n, problem_size.m))
|
||||
C_row = torch.permute(C_col, (0, 2, 1))
|
||||
|
||||
out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta
|
||||
|
||||
if self.layout_C == cutlass.LayoutType.ColumnMajor:
|
||||
out = torch.permute(out_row, (0, 2, 1))
|
||||
else:
|
||||
out = out_row
|
||||
|
||||
return torch.flatten(out)
|
||||
|
||||
def __call__(self, A, B, C, problem_size, batch=1, epilogue_args=None):
|
||||
# Running the mainloop
|
||||
accum = self.run(
|
||||
A, B, C, problem_size, 1.0, 0.0, batch=batch
|
||||
).reshape(batch, problem_size.m, problem_size.n)
|
||||
|
||||
# Running the epilogue
|
||||
epilogue_args["accum"] = accum
|
||||
references = self.epilogue_visitor(**epilogue_args)
|
||||
|
||||
# Return the results
|
||||
if not isinstance(references, tuple):
|
||||
references = (references,)
|
||||
return references
|
||||
|
||||
|
||||
class EVTTestBed:
    """
    Epilogue Visitor Testbed

    Compiles a GEMM whose epilogue is generated by tracing ``evt_fn`` and
    provides :meth:`verify`, which runs the device kernel and checks it
    against the host reference (``EVTReferenceModule``).
    """
    def __init__(self, element, evt_fn, example_inputs, profile=False, **kwargs) -> None:
        # element: data type of the A/B operands (e.g. cutlass.DataType.f16)
        # evt_fn: Python function describing the epilogue; traced below
        # example_inputs: example argument dict used both for tracing and to
        #   derive tensor shapes/dtypes in verify()
        # profile: when True, verify() also reports kernel duration
        self.element = element
        layout = cutlass.LayoutType.RowMajor
        self.example_inputs = example_inputs

        # Create the Gemm plan
        self.plan = cutlass.op.Gemm(element=element, layout=layout, element_accumulator=torch.float32)

        # Optional plan overrides forwarded through kwargs
        if "tile_description" in kwargs:
            self.plan.tile_description = kwargs["tile_description"]

        if "swizzling_functor" in kwargs:
            self.plan.swizzling_functor = kwargs["swizzling_functor"]

        # Compile the epilogue visitor
        epilogue_visitor = cutlass.epilogue.trace(evt_fn, example_inputs)
        if "epilogue_stages" in kwargs:
            epilogue_visitor.epilogue_stages = kwargs["epilogue_stages"]
        self.plan.epilogue_visitor = epilogue_visitor

        # Reference model (host-side GEMM + epilogue implementation)
        self.reference_fn = EVTReferenceModule(layout, layout, layout, epilogue_visitor)

        self.profile = profile

    def get_torch_tensor(self, shape, dtype=None, fill=None):
        """Allocate a CUDA tensor of ``shape``.

        When ``fill`` is None the tensor holds small integral values
        (ceil of uniform(-4.5, 3.5), i.e. integers in [-4, 4]) so that
        device/reference comparisons can be exact; otherwise every element
        is set to ``fill``.
        """
        if dtype is None:
            dtype = self.element

        dtype = torch_type(dtype)
        if fill is None:
            return torch.ceil(
                torch.empty(size=shape, dtype=dtype, device="cuda").uniform_(-4.5, 3.5)
            )
        else:
            return torch.full(shape, fill, dtype=dtype, device="cuda")

    def verify(self, problem_size, input_keys, result_keys, batch_count=1):
        """
        Verify the results

        :param problem_size: (m, n, k) GEMM extents
        :param input_keys: names in ``example_inputs`` that are kernel inputs
        :param result_keys: names in ``example_inputs`` that the kernel produces
        :param batch_count: number of batched GEMMs to run
        """
        problem_size = GemmCoord(*problem_size)

        # Initiate the GEMM arguments
        tensor_A = self.get_torch_tensor((batch_count, problem_size.m, problem_size.k))
        tensor_B = self.get_torch_tensor((batch_count, problem_size.k, problem_size.n))

        # Initialize the epilogue args: random data for inputs, fixed fill
        # values for outputs
        epilogue_args = {}
        for key in self.example_inputs.keys():
            if key in input_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element)
                else:
                    # Non-tensor input (e.g. a scalar): pass through unchanged
                    epilogue_args[key] = tensor
            elif key in result_keys:
                tensor = self.example_inputs[key]
                if isinstance(tensor, Tensor):
                    # Outputs whose name contains "max" start from a large
                    # negative fill; all other outputs start from zero
                    if "max" in key:
                        fill = -1000
                    else:
                        fill = 0
                    epilogue_args[key] = self.get_torch_tensor(tensor.shape, tensor.element, fill=fill)
                else:
                    epilogue_args[key] = tensor

        tensor_D = epilogue_args["D"]
        if "C" in epilogue_args:
            tensor_C = epilogue_args["C"]
        else:
            # No C operand in the visitor: reuse D as the source operand
            tensor_C = tensor_D
        # Run the device kernel
        self.plan.run(tensor_A, tensor_B, tensor_C, tensor_D, visitor_args=epilogue_args)

        # Run the host reference
        evt_args_inputs = {}
        for key in input_keys:
            evt_args_inputs[key] = epilogue_args[key]

        reference_results = self.reference_fn(
            tensor_A, tensor_B, tensor_C, problem_size, batch_count, evt_args_inputs)

        # Compare the results (exact equality: operand values are small integers)
        for result, ref in zip(result_keys, reference_results):
            assert torch.equal(epilogue_args[result].flatten(), ref.flatten())

        # Run profile
        if self.profile:
            profiler = CUDAEventProfiler(
                self.plan, 100, 100, tensor_A, tensor_B, tensor_C, tensor_D,
                visitor_args = epilogue_args
            )
            print(f"Cutlass Python Duration: {profiler()}")
|
||||
class EVTTestCaseBase(unittest.TestCase):
    """
    Base class for EVT Unittest

    Provides a default f16 problem configuration plus helpers for building
    symbolic tensors and sweeps of (m, n, k, batch) problem sizes.
    """
    def __init__(self, methodName: str = "runTest", lmnk=(6, 512, 256, 128)) -> None:
        """
        :param lmnk: default (batch, M, N, K) extents for the test problem
        """
        super().__init__(methodName)

        self.element = cutlass.DataType.f16
        self.l, self.m, self.n, self.k = lmnk

        self.problem_size = (self.m, self.n, self.k)

        # Deterministic tensor contents across runs
        torch.random.manual_seed(42)

    def fake_tensor(self, element, shape):
        """Create a row-major symbolic Tensor for tracing an epilogue visitor."""
        return Tensor(element=element, shape=shape, layout_tag=cutlass.LayoutType.RowMajor)

    def get_problem_sizes(self, alignment, k=None, batch_count=(3,)):
        """Build a sweep of (m, n, k, batch) tuples honoring ``alignment``.

        :param alignment: alignment the m/n extents are derived from
        :param k: GEMM K extent; falls back to ``self.k`` when falsy
        :param batch_count: iterable of batch sizes to sweep; the default was
            changed from the mutable list literal ``[3,]`` to a tuple
            (behavior under iteration is identical)
        :return: list of (m, n, k, batch) tuples
        """
        k = k if k else self.k
        problem_size_m = [alignment, 512 - 3 * alignment]
        problem_size_n = [alignment, 512 - alignment]
        # Larger extents are only exercised when the alignment is 8-divisible
        if alignment % 8 == 0:
            problem_size_m.append(768)
            problem_size_n.append(768)

        return [
            (m, n, k, batch)
            for m in problem_size_m
            for n in problem_size_n
            for batch in batch_count
        ]
|
||||
@ -35,15 +35,15 @@ High-level tests for running batched GEMMs
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
from math import prod
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import torch
|
||||
from math import prod
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
import torch
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
|
||||
@ -130,10 +130,5 @@ class GemmF16Batched(unittest.TestCase):
|
||||
self.run_batched((3,), False, True, False)
|
||||
self.run_batched((2, 3), False, True, False)
|
||||
|
||||
def test_batched_C(self):
    # Run both a 1-D and a 2-D batch shape with only the third flag set
    # (batched C), mirroring the sibling test_batched_* cases.
    for batch_shape in ((3,), (2, 3)):
        self.run_batched(batch_shape, False, False, True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F16 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F32 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with F64 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM80
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 80
|
||||
@ -35,14 +35,14 @@ Low-level functionality tests for GEMM with S8 operands on SM90
|
||||
"""
|
||||
|
||||
from functools import partial
|
||||
|
||||
import cutlass
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from cutlass.backend.test.utils import LayoutCombination, add_test_gemm
|
||||
import cutlass
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
|
||||
from utils import LayoutCombination, add_test_gemm
|
||||
|
||||
|
||||
cutlass.set_log_level(logging.WARNING)
|
||||
cc = 90
|
||||
387
test/python/cutlass/gemm/gemm_testbed.py
Normal file
387
test/python/cutlass/gemm/gemm_testbed.py
Normal file
@ -0,0 +1,387 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
from math import prod
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import torch
|
||||
|
||||
from cutlass import (
|
||||
DataType,
|
||||
DataTypeSize,
|
||||
GemmUniversalMode,
|
||||
LayoutType,
|
||||
OpcodeClass,
|
||||
ShortDataTypeNames,
|
||||
SwizzlingFunctor
|
||||
)
|
||||
|
||||
from cutlass.backend import compiler
|
||||
from cutlass.backend.gemm_operation import GemmArguments, GemmOperationUniversal
|
||||
from cutlass.backend.memory_manager import get_allocated_size
|
||||
from cutlass.backend.reduction_operation import ReductionArguments, ReductionOperation
|
||||
from cutlass.shape import GemmCoord, MatrixCoord
|
||||
from cutlass.utils.datatypes import torch_type
|
||||
|
||||
|
||||
class GemmUniversalLauncher:
    """
    Compiles a GEMM universal operation (plus, pre-SM90, a companion split-K
    reduction kernel) and launches it over concrete problem sizes, verifying
    each device result exactly against a torch host reference.
    """
    def __init__(
        self,
        operation,
        seed=2080,
        verification=True,
        iterations=500,
        compiler_mode="nvcc",
        **kwargs,
    ) -> None:
        # Create the reduction kernel, if needed (combines the per-slice
        # partial results when running split-K parallel mode)
        self.reduction_operation: ReductionOperation = ReductionOperation(
            shape=MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C,
            element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.epilogue_functor.element_epilogue,
            epilogue_functor=operation.epilogue_functor,
            count=operation.C.alignment,
        )

        self.math_operation = operation.tile_description.math_instruction.math_operation
        self.verification = verification

        # Select the backend compiler used for the generated kernels
        if compiler_mode == "nvcc":
            compiler.nvcc()
        elif compiler_mode == "nvrtc":
            compiler.nvrtc()
        else:
            raise Exception(f"Unexpected compiler string {compiler_mode}")

        op_list = [operation]
        if operation.arch < 90:
            # Split K via Python is currently only supported for pre-SM90 kernels
            op_list.append(self.reduction_operation)

        compiler.add_module(op_list, bypass_cache=False)

        self.operation = operation

        # torch dtypes for each operand; D shares C's element type
        self.dtype_A = torch_type(operation.A.element)
        self.dtype_B = torch_type(operation.B.element)
        self.dtype_C = torch_type(operation.C.element)
        self.dtype_D = torch_type(operation.C.element)

        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
        element_size = DataTypeSize[operation.A.element]

        # Narrow the random-value range with the element width so the exact
        # comparison against the host reference cannot be broken by overflow
        # or rounding
        if element_size == 1:
            self.rand_max = 1
            self.rand_min = 0
        elif element_size <= 8:
            self.rand_max = 1
            self.rand_min = -1
        elif element_size == 16:
            self.rand_max = 4
            self.rand_min = -4
        else:
            self.rand_max = 8
            self.rand_min = -8

        self.seed = seed

        self.compute_type = operation.epilogue_functor.element_epilogue
        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    def print_problem_size(self, p, mode, batch_count):
        """Print a human-readable description of a (failing) configuration."""
        if mode == GemmUniversalMode.Gemm:
            mode = "Gemm"
        elif mode == GemmUniversalMode.Batched:
            mode = "GemmBatched"
        elif mode == GemmUniversalMode.GemmSplitKParallel:
            mode = "GemmSplitKParallel"
        print(f"problem: {p.m}, {p.n}, {p.k}\n batch_count: {batch_count}\n mode: {mode}")

    def uniform_init(self, shape, dtype, layout):
        """Allocate and initialize one operand.

        :return: ``(data_cutlass, data_ref)`` — the tensor handed to the
            CUTLASS kernel (transposed+contiguous for column-major operands,
            always moved to CUDA) and the logical row-major view used by the
            torch reference.
        """
        size = prod(shape)
        if dtype.is_floating_point:
            # Integral values in [rand_min, rand_max] so comparisons are exact
            data = torch.ceil(torch.empty(size=(size,), dtype=dtype, device="cuda").uniform_(self.rand_min - 0.5, self.rand_max - 0.5))
        else:
            # PyTorch does not currently support integer-typed matrix multiplications on GPU.
            # Fall back to CPU for integer type references.
            data = torch.empty(size=(size,), dtype=dtype, device="cpu").random_(self.rand_min, self.rand_max + 1)

        # NOTE(review): f32/f64 reference data is also moved to CPU here —
        # presumably so the reference matmul runs on the host for those
        # types; confirm intent
        if dtype == torch.float64 or dtype == torch.float32:
            data = data.to("cpu")

        data_ref = data.reshape(shape)

        if layout == LayoutType.RowMajor:
            data_cutlass = data_ref
        else:
            # Column-major operand: store the transpose contiguously
            data_cutlass = data_ref.transpose(-1, -2).contiguous()

        data_cutlass = data_cutlass.to("cuda")
        return data_cutlass, data_ref

    def reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        """Compute the host reference ``D = alpha * (A @ B) + beta * C``."""
        # If any tensor is on CPU, place all tensors on CPU unless only
        # tensor C is on CPU
        devices = [x.device.type for x in [tensor_A, tensor_B, tensor_C]]
        if "cpu" in devices and devices != ["cuda", "cuda", "cpu"]:
            device = torch.device("cpu")
        else:
            device = tensor_A.device

        tensor_A = tensor_A.to(device)
        tensor_B = tensor_B.to(device)
        tensor_C = tensor_C.to(device)

        dtype = torch_type(self.compute_type)
        alpha_torch = torch.tensor([alpha], device=device).to(dtype)
        beta_torch = torch.tensor([beta], device=device).to(dtype)

        tmp = tensor_A @ tensor_B
        tensor_D_ref = (alpha_torch * tmp) + (tensor_C * beta_torch)
        return tensor_D_ref.to(self.dtype_D)

    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        """Launch the kernel for one configuration and (optionally) verify it.

        :return: True when verification is disabled or the device result
            matches the host reference exactly
        """
        torch.random.manual_seed(self.seed)

        # Assign an actual batch count in cases where we are not running in batched mode.
        # This is to differentiate between the number of split K slices and the batch count,
        # which are overloaded within the single `batch_count` variable.
        if mode == GemmUniversalMode.Batched:
            true_batch_count = batch_count
        else:
            true_batch_count = 1

        def transpose(layout):
            # Flip row-major <-> column-major
            if layout == LayoutType.RowMajor:
                return LayoutType.ColumnMajor
            else:
                return LayoutType.RowMajor

        # When the operation was constructed with swapped operands
        # (`switched`), the other operand's transposed layout applies
        tensor_A, tensor_A_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.k),
            self.dtype_A,
            self.operation.A.layout if not self.operation.switched else transpose(self.operation.B.layout),
        )
        tensor_B, tensor_B_ref = self.uniform_init(
            (true_batch_count, problem_size.k, problem_size.n),
            self.dtype_B,
            self.operation.B.layout if not self.operation.switched else transpose(self.operation.A.layout),
        )
        tensor_C, tensor_C_ref = self.uniform_init(
            (true_batch_count, problem_size.m, problem_size.n),
            self.dtype_C,
            self.operation.C.layout if not self.operation.switched else transpose(self.operation.C.layout),
        )
        tensor_D = torch.zeros_like(tensor_C)

        # Integer epilogues require integral alpha/beta scalars
        if self.compute_type in [DataType.s8, DataType.s32, DataType.u8, DataType.u32]:
            alpha = int(alpha)
            beta = int(beta)

        #
        # Launch kernel
        #

        arguments = GemmArguments(
            operation=self.operation,
            problem_size=problem_size,
            A=tensor_A,
            B=tensor_B,
            C=tensor_C,
            D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            gemm_mode=mode,
            split_k_slices=split_k_slices,
            batch=batch_count,
        )

        if mode == GemmUniversalMode.GemmSplitKParallel:
            # The GEMM writes per-slice partial results to a workspace; the
            # reduction kernel folds them into tensor_D
            reduction_arguments = ReductionArguments(
                self.reduction_operation,
                problem_size=[problem_size.m, problem_size.n],
                partitions=split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta),
            )

        self.operation.run(arguments)

        if mode == GemmUniversalMode.GemmSplitKParallel:
            self.reduction_operation.run(reduction_arguments)

        passed = True

        if self.verification:
            # Synchronize on whichever arguments object owns the final output
            if mode == GemmUniversalMode.GemmSplitKParallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.reference(
                problem_size,
                tensor_A_ref,
                tensor_B_ref,
                tensor_C_ref,
                alpha,
                beta,
            )

            tensor_D_ref = tensor_D_ref.to('cuda')

            # The kernel stored D column-major in these cases; transpose back
            # to compare against the row-major reference
            if self.operation.switched or self.operation.C.layout == LayoutType.ColumnMajor:
                tensor_D = tensor_D.transpose(-1, -2).contiguous()

            passed = tensor_D.equal(tensor_D_ref)

            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, mode, batch_count)
        del arguments
        if mode == GemmUniversalMode.GemmSplitKParallel:
            del reduction_arguments

        # Every device allocation made for this run must have been released
        cur_size = get_allocated_size()
        assert cur_size == 0, f"{cur_size} B of memory were not released after this run"

        return passed
|
||||
|
||||
|
||||
def test_all_gemm(operation: "GemmOperationUniversal", testcase="universal", compilation_mode="nvcc"):
    """Sweep an operation over problem sizes/modes and verify every launch.

    :param operation: the compiled-to-be GEMM universal operation under test
    :param testcase: "universal" (default sweep) or "multistage" (fixed,
        stage-derived K extents); "interleaved" is not supported
    :param compilation_mode: "nvcc" or "nvrtc"
    :return: True when every configuration passed verification
    """
    passed = True

    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element]
    )
    opcode_class = operation.tile_description.math_instruction.opcode_class

    # SIMT kernels have no vectorized-access alignment requirement
    if opcode_class == OpcodeClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size

    alignment_m = alignment
    alignment_n = alignment
    alignment_k = alignment

    # INT8 alignment constraints
    if opcode_class == OpcodeClass.Simt:
        A_is_s8 = operation.A.element == DataType.s8
        B_is_s8 = operation.B.element == DataType.s8

        if A_is_s8 and operation.A.layout == LayoutType.ColumnMajor:
            alignment_m = 4
        # Fixed: previously this compared the boolean `B_is_s8` against
        # `DataType.s8` (always False) and inspected A's layout rather than
        # B's, so the alignment_n constraint never applied.
        if B_is_s8 and operation.B.layout == LayoutType.RowMajor:
            alignment_n = 4
        if A_is_s8 and B_is_s8 and (operation.A.layout == LayoutType.RowMajor or operation.B.layout == LayoutType.ColumnMajor):
            alignment_k = 4

    threadblock_k = operation.tile_description.threadblock_shape[2]

    assert testcase != "interleaved"

    # Split-K through Python is only supported pre-SM90 and is incompatible
    # with the stream-K swizzling functor
    supports_split_k = operation.arch < 90 and not operation.swizzling_functor == SwizzlingFunctor.StreamK

    if testcase == "multistage":
        modes = [GemmUniversalMode.Gemm]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [
            threadblock_k,
            threadblock_k * operation.tile_description.stages
            + operation.tile_description.math_instruction.instruction_shape[2],
        ]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1]
    else:
        modes = [GemmUniversalMode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if supports_split_k:
            modes.append(GemmUniversalMode.GemmSplitKParallel)

        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        # `stages is None` means stage count is selected automatically;
        # use a representative value for sizing K
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k,
        ]
        problem_alpha = [1.0]
        problem_beta = [2.0]

    testbed = GemmUniversalLauncher(operation, compiler_mode=compilation_mode)

    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if k // batch_count < 2 * threadblock_k:
                                        continue

                                problem_size = GemmCoord(m, n, k)

                                # batch_count doubles as the split-K slice
                                # count when split-K is available
                                if supports_split_k:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1

                                overridden_mode = mode
                                if mode == GemmUniversalMode.Gemm and batch_count > 1:
                                    overridden_mode = GemmUniversalMode.Batched

                                passed = testbed.run(
                                    overridden_mode,
                                    problem_size,
                                    batch_count,
                                    split_k_slices,
                                    alpha,
                                    beta,
                                )

                                # Stop at the first failing configuration
                                if not passed:
                                    return False

    return passed
|
||||
@ -30,12 +30,14 @@
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
loader = unittest.TestLoader()
|
||||
tests = loader.discover('./', 'conv2d_*.py')
|
||||
script_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
|
||||
tests = loader.discover(script_dir, 'gemm_*.py')
|
||||
testRunner = unittest.runner.TextTestRunner()
|
||||
results = testRunner.run(tests)
|
||||
if not results.wasSuccessful():
|
||||
239
test/python/cutlass/gemm/utils.py
Normal file
239
test/python/cutlass/gemm/utils.py
Normal file
@ -0,0 +1,239 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass
|
||||
|
||||
from cutlass import (
|
||||
DataTypeNames,
|
||||
EpilogueScheduleSuffixes,
|
||||
KernelScheduleSuffixes,
|
||||
LayoutType,
|
||||
OpcodeClassNames,
|
||||
ShortDataTypeNames,
|
||||
ShortLayoutTypeNames
|
||||
)
|
||||
from cutlass.backend import library
|
||||
from cutlass.backend.utils.software import SubstituteTemplate
|
||||
|
||||
from gemm_testbed import test_all_gemm
|
||||
|
||||
|
||||
class Layout:
    """Shorthand mapping from BLAS transpose notation to CUTLASS layouts.

    ``T`` (transposed) corresponds to row-major storage and ``N``
    (non-transposed) to column-major storage.
    """

    T = LayoutType.RowMajor
    N = LayoutType.ColumnMajor
|
||||
|
||||
|
||||
class LayoutCombination:
    """All eight (A, B, C) layout triples for a GEMM's operands.

    Each constant is named with BLAS-style transpose letters; see
    ``Layout`` for the letter-to-layout mapping.
    """

    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)
|
||||
|
||||
|
||||
def get_name(
    layouts,
    alignments,
    element_output,
    element_accumulator,
    element_epilogue,
    cluster_shape,
    threadblock_shape,
    stages,
    element_a,
    element_b,
    arch,
    opclass,
    kernel_schedule=None,
    epilogue_schedule=None,
    suffix="",
):
    """
    Generates a procedural name for a test case.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param kernel_schedule: kernel_schedule type
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue_schedule type
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: str
    """
    template = (
        "test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}"
        "_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${k}${e}${suffix}"
    )

    # Assemble every template substitution up front, then fill in one pass
    values = {
        "arch": str(arch),
        "eA": DataTypeNames[element_a],
        "eB": DataTypeNames[element_b],
        "eC": DataTypeNames[element_output],
        "lA": ShortLayoutTypeNames[layouts[0]],
        "lB": ShortLayoutTypeNames[layouts[1]],
        "lC": ShortLayoutTypeNames[layouts[2]],
        "opclass": OpcodeClassNames[opclass],
        "acc": DataTypeNames[element_accumulator],
        "cM": str(cluster_shape[0]),
        "cN": str(cluster_shape[1]),
        "cK": str(cluster_shape[2]),
        "tbM": str(threadblock_shape[0]),
        "tbN": str(threadblock_shape[1]),
        "tbK": str(threadblock_shape[2]),
        "stages": str(stages) if stages is not None else "auto",
        "aA": str(alignments[0]),
        "aB": str(alignments[1]),
        "aC": str(alignments[2]),
        "k": KernelScheduleSuffixes[kernel_schedule] if kernel_schedule is not None else "",
        "e": EpilogueScheduleSuffixes[epilogue_schedule] if epilogue_schedule is not None else "",
        "suffix": suffix if suffix is not None else "",
    }
    return SubstituteTemplate(template, values)
|
||||
|
||||
|
||||
def add_test_gemm(
    cls=None,
    cc=None,
    element=None,
    layouts=None,
    alignments=None,
    element_output=None,
    element_accumulator=None,
    cluster_shape=None,
    threadblock_shape=None,
    warp_count=None,
    stages=None,
    opclass=None,
    swizzle=None,
    kernel_schedule=None,
    epilogue_schedule=None,
    compilation_modes=('nvcc', 'nvrtc')):
    """
    Create test-running functions with the given specification and set it as a method of ``cls``.

    :param cls: class to which the generated method will be added
    :type cls: type
    :param cc: compute capability to compile for
    :type cc: int
    :param element: data type of A and B operands
    :type element: cutlass.DataType.f16
    :param layouts: layouts of A, B, and C operands
    :type layouts: list or tuple
    :param alignments: alignments of A, B, and C operands
    :type alignments: list or tuple
    :param element_output: data type of the output element
    :type element_output: cutlass.DataType
    :param element_accumulator: data type used in accumulation
    :type element_accumulator: cutlass.DataType
    :param cluster_shape: dimensions of clusters
    :type cluster_shape: list or tuple
    :param threadblock_shape: dimensions of threadblock tiles
    :type threadblock_shape: list or tuple
    :param warp_count: warps to be launched per threadblock dimension
    :type warp_count: list or tuple
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpcodeClass
    :param swizzle: threadblock swizzling functor
    :param kernel_schedule: kernel schedule to use
    :type kernel_schedule: cutlass.KernelScheduleType
    :param epilogue_schedule: epilogue schedule to use
    :type epilogue_schedule: cutlass.EpilogueScheduleType
    :param compilation_modes: compilers to be used in testing the kernel
        (options: 'nvrtc', 'nvcc'); default changed from a mutable list
        literal to a tuple — iteration behavior is unchanged
    :type compilation_modes: list or tuple
    """

    for compilation_mode in compilation_modes:
        # Bind the loop variable as a keyword-only default. Without this,
        # Python's late-binding closures would make every generated test
        # method compile with the *last* mode in `compilation_modes`, even
        # though each method's name advertises a specific mode.
        def run(self, *, compilation_mode=compilation_mode):
            """
            Dynamically-generated function that constructs a GEMM operation and verifies it against
            multiple test cases.
            """
            element_A = element
            element_B = element
            layout_A, layout_B, layout_C = layouts
            alignment_A, alignment_B, alignment_C = alignments

            plan = cutlass.op.Gemm(element_A=element_A, element_B=element_B,
                                   element_C=element_output, element_D=element_output,
                                   layout_A=layout_A, layout_B=layout_B, layout_C=layout_C,
                                   element_accumulator=element_accumulator,
                                   kernel_cc=cc)

            plan.opclass = opclass
            if swizzle is not None:
                plan.swizzling_functor = swizzle

            # Start from the first advertised tile description and apply the
            # requested overrides
            td = plan.tile_descriptions()[0]

            if warp_count is not None:
                td.warp_count = warp_count
            td.threadblock_shape = threadblock_shape
            td.stages = stages
            td.cluster_shape = cluster_shape
            op = plan.construct(tile_description=td, alignment_A=alignment_A, alignment_B=alignment_B, alignment_C=alignment_C)
            self.assertTrue(test_all_gemm(op, 'universal', compilation_mode=compilation_mode))

        element_epilogue = element_accumulator
        name = get_name(
            layouts=layouts, alignments=alignments, element_output=element_output, element_accumulator=element_accumulator,
            element_epilogue=element_epilogue, cluster_shape=cluster_shape, threadblock_shape=threadblock_shape,
            stages=stages, element_a=element, element_b=element, arch=cc, opclass=opclass,
            kernel_schedule=kernel_schedule, epilogue_schedule=epilogue_schedule, suffix=f'_{compilation_mode}')

        setattr(cls, name, run)
|
||||
@ -38,7 +38,6 @@ from math import ceil
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass_bindings
|
||||
import cutlass.utils.datatypes as datatypes
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from utils import ExpectException
|
||||
245
test/python/cutlass/interface/evt_interface.py
Normal file
245
test/python/cutlass/interface/evt_interface.py
Normal file
@ -0,0 +1,245 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Test the EVT interface
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
from cutlass import LayoutType, Tensor
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from cutlass.epilogue import reshape, permute
|
||||
|
||||
from utils import ExpectException
|
||||
|
||||
|
||||
@unittest.skipIf(device_cc() not in [80, 90], "This unittest is for Sm80 and Sm90 only")
class EVTErrorTests(unittest.TestCase):
    """
    Exercises the error paths of the EVT tracing interface and verifies that each
    misuse is rejected with the expected exception type and message.
    """

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT requires root node be 'D'")
    def test_root_not_d(self):
        """An Sm90 epilogue whose return values omit 'D' must be rejected."""
        def evt_root_not_d(accum, alpha):
            F = accum * alpha
            return F

        shape = (6, 512, 512)
        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "alpha": 1.2,
            "F": self.fake_tensor(np.float16, shape)
        }

        with ExpectException(device_cc() == 90,
            "SyntaxError: Sm90 EVT requires the epilogue to have a returned tensor D, "
            "but the variable 'D' is not found in the return values.", True):
            cutlass.epilogue.trace(evt_root_not_d, tensors)

    def test_no_accum(self):
        """An epilogue whose argument list lacks 'accum' must be rejected."""
        def evt_no_accum(alpha, C):
            D = alpha * C
            return D

        shape = (6, 512, 512)
        tensors = {
            "C": self.fake_tensor(np.float16, shape),
            "alpha": 1.2,
            "D": self.fake_tensor(np.float16, shape)
        }

        with ExpectException(True, "SyntaxError: Cannot find 'accum' in the argument list.", True):
            cutlass.epilogue.trace(evt_no_accum, tensors)

    @unittest.skipIf(device_cc() != 90, "Only Sm90 EVT has concern on smem size")
    def test_too_much_shared_memory(self):
        """An epilogue whose auxiliary outputs exceed the shared-memory budget must be rejected."""
        def evt_too_much_shared_memory(accum, C1, C2, C3, C4, C5):
            D1 = accum + C1
            D2 = D1 + C2
            D3 = D2 + C3
            D4 = D3 + C4
            D = D4 + C5
            return D, D1, D2, D3, D4

        shape = (6, 512, 512)
        # Order matches the original literal: accum, sources C1..C5, sinks D1..D4, then D
        names = ["accum", "C1", "C2", "C3", "C4", "C5", "D1", "D2", "D3", "D4", "D"]
        tensors = {name: self.fake_tensor(np.float16, shape) for name in names}

        visitor = cutlass.epilogue.trace(evt_too_much_shared_memory, tensors)

        plan = cutlass.op.Gemm(
            element=np.float16, layout=cutlass.LayoutType.RowMajor,
            element_accumulator=np.float32
        )

        with ExpectException(True,
            "RuntimeError: The epilogue consumes too much shared memory. "
            "No valid tile description is found in the generator.", True):
            plan.epilogue_visitor = visitor

    def test_not_ssa(self):
        """Epilogues must be in SSA form: no redefinition and no use of undefined names."""
        def evt_redefine(accum, C, alpha):
            F = accum + C
            F = F * alpha
            D = F
            return D, F

        shape = (6, 512, 512)
        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "C": self.fake_tensor(np.float16, shape),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, shape),
            "F": self.fake_tensor(np.float16, shape)
        }

        with ExpectException(True, "SyntaxError: Variable 'F' cannot be defined twice.", True):
            cutlass.epilogue.trace(evt_redefine, tensors)

        def evt_undefine(accum, alpha):
            F = accum + C
            D = F * alpha
            return D, F

        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "alpha": 1.5,
            "D": self.fake_tensor(np.float16, shape),
            "F": self.fake_tensor(np.float16, shape)
        }

        with ExpectException(True, "SyntaxError: Variable 'C' is undefined.", True):
            cutlass.epilogue.trace(evt_undefine, tensors)

    def test_missing_example_tensor(self):
        """Every input and output variable must come with an example tensor."""
        def evt_missing_example_tensor(accum, C):
            D = accum + C
            return D

        shape = (6, 512, 512)
        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "C": self.fake_tensor(np.float16, shape),
        }

        with ExpectException(True, "RuntimeError: Example input for D is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, tensors)

        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "D": self.fake_tensor(np.float16, shape),
        }

        with ExpectException(True, "RuntimeError: Example input for C is not provided.", True):
            cutlass.epilogue.trace(evt_missing_example_tensor, tensors)

    def test_return_expression(self):
        """Return values must be named variables, not expressions."""
        def evt_return_expr(accum, C):
            return accum + C

        shape = (6, 512, 512)
        tensors = {
            "accum": self.fake_tensor(np.float16, shape),
            "C": self.fake_tensor(np.float16, shape),
        }

        with ExpectException(True, "SyntaxError: Return value cannot be an expression", True):
            cutlass.epilogue.trace(evt_return_expr, tensors)

    def test_incompatible_shape(self):
        """Example tensors whose dimensions do not match must be rejected."""
        def evt_incompatible_shape(accum, C):
            D = accum + C
            return D

        tensors = {
            "accum": self.fake_tensor(np.float16, (6, 256, 512)),
            "C": self.fake_tensor(np.float16, (6, 512, 512)),
            "D": self.fake_tensor(np.float16, (6, 512, 512))
        }

        with ExpectException(True,
            "RuntimeError: Dimension mismatch between accum(6, 256, 512), C(6, 512, 512).", True):
            cutlass.epilogue.trace(evt_incompatible_shape, tensors)

    def test_no_matching_impl(self):
        """A layout manipulation with no matching backend op must be rejected."""
        def evt_no_matching_impl(accum, bias):
            D = accum + reshape(permute(bias, indices=(1, 0)), new_shape=(512, 1))
            return D

        tensors = {
            "accum": self.fake_tensor(np.float16, (6, 512, 256)),
            "bias": self.fake_tensor(np.float16, (16, 32)),
            "D": self.fake_tensor(np.float16, (6, 512, 256))
        }

        with ExpectException(True, "NotImplementedError: No matching op for node bias with stride (0, (1, 32), 0).", True):
            cutlass.epilogue.trace(evt_no_matching_impl, tensors)

    #
    # Helpers
    #

    def fake_tensor(self, element, shape):
        """Build a row-major placeholder Tensor of the given element type and shape."""
        return Tensor(element=element, shape=shape, layout_tag=LayoutType.RowMajor)


if __name__ == '__main__':
    unittest.main()
|
||||
@ -38,7 +38,6 @@ from math import ceil
|
||||
import unittest
|
||||
|
||||
import cutlass
|
||||
import cutlass_bindings
|
||||
import cutlass.utils.datatypes as datatypes
|
||||
from cutlass.backend.utils.device import device_cc
|
||||
from utils import ExpectException
|
||||
@ -262,13 +261,13 @@ class GemmErrorTests(unittest.TestCase):
|
||||
|
||||
# Ensure that all tile descriptions have opclass of TensorOp
|
||||
for td in plan.tile_descriptions():
|
||||
assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.TensorOp
|
||||
assert td.math_instruction.opcode_class == cutlass.OpcodeClass.TensorOp
|
||||
|
||||
plan.opclass = cutlass.OpcodeClass.Simt
|
||||
|
||||
# Ensure that all tile descriptions have opclass of Simt
|
||||
for td in plan.tile_descriptions():
|
||||
assert td.math_instruction.opcode_class == cutlass_bindings.OpClass.Simt
|
||||
assert td.math_instruction.opcode_class == cutlass.OpcodeClass.Simt
|
||||
|
||||
def test_invalid_tile_description(self):
|
||||
"""
|
||||
@ -50,9 +50,10 @@ class ExpectException:
|
||||
:param message: message to print if an exception is raised when not expected or vice versa
|
||||
:type message: str
|
||||
"""
|
||||
def __init__(self, exception_expected: bool, message: str = ''):
|
||||
def __init__(self, exception_expected: bool, message: str = '', verify_msg=False):
|
||||
self.exception_expected = exception_expected
|
||||
self.message = message
|
||||
self.verify_msg = verify_msg
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
@ -60,6 +61,9 @@ class ExpectException:
|
||||
def __exit__(self, exc_type, exc_val, traceback):
|
||||
exception_raised = exc_type is not None
|
||||
assert self.exception_expected == exception_raised, self.message
|
||||
if self.verify_msg:
|
||||
exc_message = f"{exc_type.__name__}: {exc_val}"
|
||||
assert exc_message == self.message, f"expect error message {self.message}, got {exc_message}"
|
||||
|
||||
# Suppress the exception
|
||||
return True
|
||||
75
test/python/pycute/run_all_tests.py
Normal file
75
test/python/pycute/run_all_tests.py
Normal file
@ -0,0 +1,75 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Utility script for discovering and running all PyCuTe tests
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import pathlib
|
||||
import unittest
|
||||
|
||||
|
||||
def numeric_log_level(log_level: str) -> int:
    """Map a textual log level (e.g. 'info', 'DEBUG') to its numeric ``logging`` constant.

    :param log_level: case-insensitive name of a standard logging level
    :type log_level: str

    :return: numeric log level suitable for ``logging.basicConfig(level=...)``
    :rtype: int

    :raises ValueError: if ``log_level`` does not name a known logging level
    """
    level = getattr(logging, log_level.upper(), None)
    if isinstance(level, int):
        return level
    raise ValueError(f"Invalid log level: {log_level}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Parse the command line, configure logging, then discover and run every
    # PyCuTe test module in this directory.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--log-level", default='info', type=numeric_log_level, required=False,
                            help='Logging level to be used by the generator script')
    cli_args = arg_parser.parse_args()

    # Honor the user-provided `--log-level` command-line option
    logging.basicConfig(level=cli_args.log_level)

    # Discover and execute all test_*.py modules that live next to this script
    this_dir = str(pathlib.Path(__file__).parent.resolve()) + '/'
    suite = unittest.TestLoader().discover(this_dir, "test_*.py")
    outcome = unittest.runner.TextTestRunner().run(suite)
    if not outcome.wasSuccessful():
        raise Exception("Test cases failed")
||||
95
test/python/pycute/test_coalesce.py
Normal file
95
test/python/pycute/test_coalesce.py
Normal file
@ -0,0 +1,95 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.coalesce
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestCoalesce(unittest.TestCase):
    """Unit tests verifying that ``coalesce`` preserves a layout's size and mapping."""

    def helper_test_coalesce(self, layout):
        """Coalesce ``layout`` and check the result is functionally identical to it."""
        flat = coalesce(layout)

        _LOGGER.debug(f"{layout} => {flat}")

        # Coalescing may simplify the shape, but never the domain size ...
        self.assertEqual(size(flat), size(layout))

        # ... nor the offset produced for any linear index
        for idx in range(size(layout)):
            self.assertEqual(flat(idx), layout(idx))

    def test_coalesce(self):
        # Table of representative layouts: scalars, flat shapes, custom strides,
        # size-1 modes, and nested (hierarchical) shapes/strides.
        cases = [
            Layout(1, 0),
            Layout(1, 1),
            Layout((2, 4)),
            Layout((2, 4, 6)),
            Layout((2, 4, 6), (1, 6, 2)),
            Layout((2, 1, 6), (1, 7, 2)),
            Layout((2, 1, 6), (4, 7, 8)),
            Layout((2, (4, 6))),
            Layout((2, 4), (4, 1)),
            Layout((2, 4, 6), (24, 6, 1)),
            Layout((2, 1, 3), (2, 4, 4)),
            Layout(((2, 2), (2, 2)), ((1, 4), (8, 32))),
        ]
        for layout in cases:
            self.helper_test_coalesce(layout)


if __name__ == "__main__":
    unittest.main()
|
||||
92
test/python/pycute/test_complement.py
Normal file
92
test/python/pycute/test_complement.py
Normal file
@ -0,0 +1,92 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.complement
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestComplement(unittest.TestCase):
    """Unit tests verifying the disjoint-codomain property of ``complement``."""

    def helper_test_complement(self, layout):
        """Compute the complement of ``layout`` and check codomain disjointness."""
        comp = complement(layout)

        _LOGGER.debug(f"{layout} => {comp}")

        # Post-condition: the codomains may only intersect at the zero offset
        for a in range(size(layout)):
            offset_a = layout(a)
            for b in range(size(comp)):
                offset_b = comp(b)
                assert (offset_a != offset_b) or (offset_a == 0 and offset_b == 0)

    def test_complement(self):
        # Table of representative layouts: scalars, stride-0 modes, flat and
        # nested shapes, and strides leaving gaps for the complement to fill.
        cases = [
            Layout(1, 0),
            Layout(1, 1),
            Layout(4, 0),
            Layout((2, 4), (1, 2)),
            Layout((2, 3), (1, 2)),
            Layout((2, 4), (1, 4)),
            Layout((2, 4, 8), (8, 1, 64)),
            Layout(((2, 2), (2, 2)), ((1, 4), (8, 32))),
            Layout((2, (3, 4)), (3, (1, 6))),
            Layout((4, 6), (1, 6)),
            Layout((4, 10), (1, 10)),
        ]
        for layout in cases:
            self.helper_test_complement(layout)


if __name__ == "__main__":
    unittest.main()
|
||||
204
test/python/pycute/test_composition.py
Normal file
204
test/python/pycute/test_composition.py
Normal file
@ -0,0 +1,204 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.composition
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestComposition(unittest.TestCase):
    """Unit tests verifying the functional correctness of layout ``composition``."""

    def helper_test_composition(self, layoutA, layoutB):
        """Compose ``layoutA`` with ``layoutB`` and check R(c) == A(B(c)) pointwise."""
        composed = composition(layoutA, layoutB)

        _LOGGER.debug(f"{layoutA} o {layoutB} => {composed}")

        # True post-condition: Every coordinate c of layoutB with L1D(c) < size(composed)
        # is a coordinate of composed.

        # Check that the composed layout agrees with applying B then A
        for c in range(size(composed)):
            self.assertEqual(composed(c), layoutA(layoutB(c)))

    def test_composition(self):
        # Table of (A, B) pairs covering scalar layouts, stride-0 modes, flat and
        # multi-mode shapes, sub-sampling compositions, and nested layouts.
        cases = [
            (Layout(1, 0), Layout(1, 0)),
            (Layout(1, 0), Layout(1, 1)),
            (Layout(1, 1), Layout(1, 0)),
            (Layout(1, 1), Layout(1, 1)),
            (Layout(4), Layout(4)),
            (Layout(4, 2), Layout(4)),
            (Layout(4), Layout(4, 2)),
            (Layout(4, 0), Layout(4)),
            (Layout(4), Layout(4, 0)),
            (Layout(1, 0), Layout(4)),
            (Layout(4), Layout(1, 0)),
            (Layout(4), Layout(2)),
            (Layout(4, 2), Layout(2)),
            (Layout(4), Layout(2, 2)),
            (Layout(4, 2), Layout(2, 2)),
            (Layout(12), Layout((4, 3))),
            (Layout(12, 2), Layout((4, 3))),
            (Layout(12), Layout((4, 3), (3, 1))),
            (Layout(12, 2), Layout((4, 3), (3, 1))),
            (Layout(12), Layout((2, 3), (2, 4))),
            (Layout((4, 3)), Layout((4, 3))),
            (Layout((4, 3)), Layout(12)),
            (Layout((4, 3)), Layout(6, 2)),
            (Layout((4, 3)), Layout((6, 2), (2, 1))),
            (Layout((4, 3), (3, 1)), Layout((4, 3))),
            (Layout((4, 3), (3, 1)), Layout(12)),
            (Layout((4, 3), (3, 1)), Layout(6, 2)),
            (Layout((4, 3), (3, 1)), Layout((6, 2), (2, 1))),
            (Layout((8, 8)), Layout(((2, 2, 2), (2, 2, 2)), ((1, 16, 4), (8, 2, 32)))),
            (Layout((8, 8), (8, 1)), Layout(((2, 2, 2), (2, 2, 2)), ((1, 16, 4), (8, 2, 32)))),
            (Layout(((2, 2, 2), (2, 2, 2)), ((1, 16, 4), (8, 2, 32))), Layout(8, 4)),
            (Layout((4, 2), (1, 16)), Layout((4, 2), (2, 1))),
            (Layout((2, 2), (2, 1)), Layout((2, 2), (2, 1))),
            (Layout((4, 8, 2)), Layout((2, 2, 2), (2, 8, 1))),
            (Layout((4, 8, 2), (2, 8, 1)), Layout((2, 2, 2), (1, 8, 2))),
            (Layout((4, 8, 2), (2, 8, 1)), Layout((4, 2, 2), (2, 8, 1))),
        ]
        for layoutA, layoutB in cases:
            self.helper_test_composition(layoutA, layoutB)


if __name__ == "__main__":
    unittest.main()
|
||||
80
test/python/pycute/test_int_tuple.py
Normal file
80
test/python/pycute/test_int_tuple.py
Normal file
@ -0,0 +1,80 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.int_tuple
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
|
||||
class TestIntTuple(unittest.TestCase):
|
||||
def test_product(self):
|
||||
self.assertEqual(product(2), 2)
|
||||
|
||||
self.assertEqual(product((3,2)), 6)
|
||||
|
||||
self.assertEqual(product(product(((2,3),4))), 24)
|
||||
|
||||
def test_inner_product(self):
|
||||
self.assertEqual(inner_product(2, 3), 6)
|
||||
|
||||
self.assertEqual(inner_product((1,2), (3,2)), 7)
|
||||
|
||||
self.assertEqual(inner_product(((2,3),4), ((2,1),2)), 15)
|
||||
|
||||
def test_shape_div(self):
|
||||
self.assertEqual(shape_div((3,4), 6), (1,2))
|
||||
|
||||
self.assertEqual(shape_div((3,4), 12), (1,1))
|
||||
|
||||
self.assertEqual(shape_div((3,4), 36), (1,1))
|
||||
|
||||
self.assertEqual(shape_div(((3,4),6), 36), ((1,1),2))
|
||||
|
||||
self.assertEqual(shape_div((6,(3,4)), 36), (1,(1,2)))
|
||||
|
||||
def test_prefix_product(self):
|
||||
self.assertEqual(prefix_product(2), 1)
|
||||
|
||||
self.assertEqual(prefix_product((3,2)), (1,3))
|
||||
|
||||
self.assertEqual(prefix_product((3,2,4)), (1,3,6))
|
||||
|
||||
self.assertEqual(prefix_product(((2,3),4)), ((1,2),6))
|
||||
|
||||
self.assertEqual(prefix_product(((2,3),(2, 1, 2),( 5, 2, 1))),
|
||||
((1,2),(6,12,12),(24,120,240)))
|
||||
|
||||
|
||||
87
test/python/pycute/test_left_inverse.py
Normal file
87
test/python/pycute/test_left_inverse.py
Normal file
@ -0,0 +1,87 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.left_inverse
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestLeftInverse(unittest.TestCase):
|
||||
def helper_test_left_inverse(self, layout):
|
||||
inv_layout = left_inverse(layout)
|
||||
|
||||
_LOGGER.debug(f"{layout} => {inv_layout}")
|
||||
|
||||
for i in range(size(layout)):
|
||||
self.assertEqual(inv_layout(layout(i)), i)
|
||||
|
||||
def test_left_inverse(self):
|
||||
test = Layout(1,0)
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((1,1),(0,0))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout(1,1)
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout(4,1)
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout(4,2)
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((8,4),(1,8))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((8,4),(4,1))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((2,4,6),(1,2,8))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((2,4,6),(4,1,8))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
test = Layout((4,2),(1,16))
|
||||
self.helper_test_left_inverse(test)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
96
test/python/pycute/test_right_inverse.py
Normal file
96
test/python/pycute/test_right_inverse.py
Normal file
@ -0,0 +1,96 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
"""
|
||||
Unit tests for pycute.left_inverse
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestRightInverse(unittest.TestCase):
|
||||
def helper_test_right_inverse(self, layout):
|
||||
inv_layout = right_inverse(layout)
|
||||
|
||||
_LOGGER.debug(f"{layout} => {inv_layout}")
|
||||
|
||||
for i in range(size(inv_layout)):
|
||||
self.assertEqual(layout(inv_layout(i)), i)
|
||||
|
||||
def test_right_inverse(self):
|
||||
test = Layout(1,0)
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((1,1),(0,0))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((3,7),(0,0))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout(1,1)
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout(4,0)
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout(4,1)
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout(4,2)
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((2,4),(0,2))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((8,4),(1,8))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((8,4),(4,1))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((2,4,6),(1,2,8))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((2,4,6),(4,1,8))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
test = Layout((4,2),(1,16))
|
||||
self.helper_test_right_inverse(test)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@ -1,6 +1,6 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
@ -30,12 +30,30 @@
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import cutlass.backend
|
||||
"""
|
||||
Unit tests for pycute.typing
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
from pycute import *
|
||||
|
||||
_LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestTyping(unittest.TestCase):
|
||||
def helper_test_typing(self, _cls, _obj, cls, expected: bool):
|
||||
_LOGGER.debug(f"issubclass({_cls}, {cls})")
|
||||
_LOGGER.debug(f"isinstance({_obj}, {cls})")
|
||||
|
||||
self.assertEqual(expected, issubclass(_cls, cls))
|
||||
self.assertEqual(expected, isinstance(_obj, cls))
|
||||
|
||||
def test_typing(self):
|
||||
self.helper_test_typing(int, 1, Integer, True)
|
||||
self.helper_test_typing(float, 1., Integer, False)
|
||||
self.helper_test_typing(str, 'hi', Integer, False)
|
||||
self.helper_test_typing(bool, False, Integer, False)
|
||||
|
||||
if __name__ == '__main__':
|
||||
cutlass.backend.get_memory_pool(2**30, 2**30)
|
||||
loader = unittest.TestLoader()
|
||||
tests = loader.discover('./', 'gemm_*.py')
|
||||
testRunner = unittest.runner.TextTestRunner()
|
||||
testRunner.run(tests)
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user