#################################################################################################
#
# Copyright (c) 2023 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

"""
Util Functions for Conv2d Test
"""
import torch
import cutlass
import unittest
import cutlass_bindings
from cutlass.utils.datatypes import binding_type, binding_opclass
from cutlass.backend.test.conv2d_testbed import Conv2dLauncher, getTensorRef, getTensorView
from cutlass.backend.utils.device import device_cc
from cutlass.backend.test.utils import get_name_conv2d
import numpy as np
def conv2d_few_channel_problemsizes(channels):
    """Return Conv2dProblemSize cases whose channel count is ``channels``.

    All cases share padding (1, 1, 1, 1), dilation (1, 1), cross-correlation
    mode, and split counts of 1; they differ only in input extent, filter
    extent, and stride.
    """
    # (N, H, W), (K, R, S), (stride_h, stride_w) for each case.
    case_specs = [
        ((1, 8, 8), (16, 3, 3), (2, 2)),
        ((1, 16, 16), (16, 3, 3), (2, 2)),
        ((1, 16, 16), (16, 7, 7), (1, 1)),
        ((1, 224, 224), (32, 7, 7), (1, 1)),
        ((1, 224, 224), (64, 7, 7), (2, 2)),
        ((1, 224, 224), (64, 5, 5), (1, 1)),
        ((1, 224, 224), (64, 5, 5), (2, 2)),
    ]

    return [
        cutlass_bindings.conv.Conv2dProblemSize(
            cutlass_bindings.Tensor4DCoord(n, h, w, channels),
            cutlass_bindings.Tensor4DCoord(k, r, s, channels),
            cutlass_bindings.Tensor4DCoord(1, 1, 1, 1),
            cutlass_bindings.MatrixCoord(stride_h, stride_w),
            cutlass_bindings.MatrixCoord(1, 1),
            cutlass_bindings.conv.Mode.cross_correlation,
            1, 1
        )
        for (n, h, w), (k, r, s), (stride_h, stride_w) in case_specs
    ]
# Shared table mapping each supported cutlass element type to its torch and
# numpy equivalents; the two lookup dicts below are projected from it.
_DTYPE_TABLE = (
    (cutlass.DataType.f16, torch.float16, np.float16),
    (cutlass.DataType.f32, torch.float32, np.float32),
    (cutlass.DataType.f64, torch.float64, np.float64),
)

# cutlass element type -> torch dtype
torch_dtype = {cutlass_type: torch_type for cutlass_type, torch_type, _ in _DTYPE_TABLE}

# cutlass element type -> numpy dtype
numpy_dtype = {cutlass_type: numpy_type for cutlass_type, _, numpy_type in _DTYPE_TABLE}
def validate_problem_size(ps, conv_kind, split_k_slices):
    """Check that a conv2d problem size is self-consistent and supported.

    Returns False when the stored output extents (P, Q) disagree with the
    extents implied by the input/filter/padding/stride/dilation, or when the
    configuration requests split-K for a strided dgrad (unsupported).
    """
    def output_extent(extent, pad, dilation, filter_extent, stride):
        # Standard cross-correlation output-size formula.
        return (extent + 2 * pad - dilation * (filter_extent - 1) - 1) // stride + 1

    expected_p = output_extent(ps.H, ps.pad_h, ps.dilation_h, ps.R, ps.stride_h)
    expected_q = output_extent(ps.W, ps.pad_w, ps.dilation_w, ps.S, ps.stride_w)
    if (expected_p, expected_q) != (ps.P, ps.Q):
        return False

    # Split-K (serial or parallel) is not supported for strided dgrad
    is_strided = ps.stride_h > 1 or ps.stride_w > 1
    if conv_kind == "dgrad" and split_k_slices > 1 and is_strided:
        return False
    return True
# Override the backend launcher
class Conv2dLauncherFrontend(Conv2dLauncher):
    """Launcher that drives a frontend ``cutlass.Conv2d`` plan in tests.

    Overrides the backend ``Conv2dLauncher`` so tensors can be created and
    compared with either numpy or torch.  With ``backend == "numpy"`` the
    reference result comes from the cutlass host conv2d (with an on-disk
    result cache); with ``backend == "torch"`` it comes from torch's own
    conv2d / conv2d_input / conv2d_weight ops on CUDA tensors.
    """

    def __init__(self, plan: cutlass.Conv2d, seed: int = 80, backend: str = "numpy"):
        """Capture the plan and derive the launch configuration.

        :param plan: frontend ``cutlass.Conv2d`` plan to execute
        :param seed: RNG seed used by :meth:`set_seed` for tensor init
        :param backend: "numpy" or "torch"; selects the tensor library used
            for initialization, reference computation, and comparison
        """
        self.operation = plan
        self.conv_kind = plan.conv_kind
        self.seed = seed
        self.backend = backend

        # Operand element types are read from the plan's private fields.
        self.dtype_A = plan._element_a
        self.dtype_B = plan._element_b
        self.dtype_C = plan._element_c
        self.dtype_acc = plan._element_accumulator

        # All operands use the NHWC layout.
        self.layout_A = cutlass_bindings.TensorNHWC
        self.layout_B = cutlass_bindings.TensorNHWC
        self.layout_C = cutlass_bindings.TensorNHWC
        self.layout_D = cutlass_bindings.TensorNHWC

        # Epilogue scalars (alpha/beta) are computed in fp32; host reference
        # results are cached on disk (see host_reference).
        self.element_compute = cutlass_bindings.float32
        self.enable_cached_results = True

        # Get randomization_max: tensors are filled with integers in
        # [-randomization_max, randomization_max].  Smaller ranges are used
        # for narrow input/accumulator types — presumably to keep the device
        # result bit-exact against the reference (comparison in equal() is
        # exact, not approximate) — TODO confirm.
        if self.dtype_A in [cutlass.DataType.f16, cutlass.DataType.bf16]:
            if self.dtype_acc in [cutlass.DataType.f16, cutlass.DataType.bf16]:
                self.randomization_max = 2
            else:
                self.randomization_max = 3
        else:
            self.randomization_max = 7

        # Activation functor configured on the plan (applied in reference()).
        self.activation = plan.activation

        # Host (CPU) conv2d reference implementation from the bindings.
        self.host_conv2d = cutlass_bindings.test.conv.host.conv2d

    def set_seed(self):
        """Seed the RNG of the active backend with ``self.seed``."""
        if self.backend == "numpy":
            np.random.seed(self.seed)
        else:
            torch.manual_seed(self.seed)

    def uniform_init(self, size, dtype):
        """Return a tensor of shape ``size`` filled with small random integers.

        numpy backend: delegates to the base launcher with the matching
        numpy dtype.  torch backend: draws uniform values in
        [-randomization_max - 0.5, randomization_max - 0.5) and ceils them,
        which yields integers in [-randomization_max, randomization_max];
        the result is a CUDA tensor in channels-last memory format.
        """
        if self.backend == "numpy":
            return super().uniform_init(size, numpy_dtype[dtype])
        else:
            tensor = torch.ceil(
                torch.empty(size=size, dtype=torch_dtype[dtype], device="cuda").uniform_(-self.randomization_max - 0.5, self.randomization_max - 0.5)
            ).to(memory_format=torch.channels_last)
            return tensor

    def zeros_like(self, tensor):
        """Return an all-zero tensor with the same shape/dtype as ``tensor``."""
        if self.backend == "numpy":
            return np.zeros_like(tensor)
        else:
            # Keep the channels-last memory format used by uniform_init.
            return torch.zeros_like(tensor).to(memory_format=torch.channels_last)

    def reference(self, ps, A, B, C, alpha, beta, activation):
        """Compute the reference D = activation(alpha * conv(A, B) + beta * C).

        numpy backend: defers to :meth:`host_reference` (cached host conv2d).
        torch backend: uses torch's conv ops per conv kind — for dgrad/wgrad,
        A is the incoming gradient and B is the weight/activation operand.

        NOTE(review): the dgrad/wgrad torch calls do not forward dilation —
        they appear to assume dilation (1, 1); confirm against the problem
        sizes used.
        """
        if self.backend == "numpy":
            numpy_result = self.host_reference(ps, A, B, C, alpha, beta, activation)
            return numpy_result
        else:
            if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
                torch_result = alpha * torch.ops.aten.conv2d(
                    A,
                    B,
                    stride=(ps.stride_h, ps.stride_w),
                    padding=(ps.pad_h, ps.pad_w),
                    dilation=(ps.dilation_h, ps.dilation_w)
                ) + beta * C
            elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
                torch_result = alpha * torch.nn.grad.conv2d_input(
                    (ps.N, ps.C, ps.H, ps.W),
                    B,
                    A,
                    padding=(ps.pad_h, ps.pad_w),
                    stride=(ps.stride_h, ps.stride_w)
                ) + beta * C
            elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
                torch_result = alpha * torch.nn.grad.conv2d_weight(
                    B,
                    (ps.K, ps.C, ps.R, ps.S),
                    A,
                    padding=(ps.pad_h, ps.pad_w),
                    stride=(ps.stride_h, ps.stride_w)
                ) + beta * C
            else:
                raise Exception(f"Conv kind {self.conv_kind} is currently unsupported.")

            # Apply the configured epilogue activation to the linear result.
            if activation == cutlass.backend.epilogue.relu:
                torch_result = torch.nn.functional.relu(torch_result)
            elif activation == cutlass.backend.epilogue.leaky_relu:
                torch_result = torch.nn.functional.leaky_relu(torch_result, 0.5)

            return torch_result

    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta, activation):
        """Compute the reference result on the host via the bindings.

        When ``self.enable_cached_results`` is set, results are looked up in
        (and appended to) an on-disk cache keyed by conv kind, problem size,
        scalars, operand contents, and activation; in that mode the return
        value is a hash of the D tensor, not the tensor itself.  With caching
        disabled the raw reference tensor is returned.
        """
        # Convert alpha/beta to the epilogue compute type.
        if self.element_compute == cutlass_bindings.float16:
            alpha = cutlass_bindings.float16(alpha)
            beta = cutlass_bindings.float16(beta)
        elif self.element_compute == cutlass_bindings.int32:
            alpha = int(alpha)
            beta = int(beta)
        else:
            alpha = alpha
            beta = beta

        # If cached result is loaded
        cached_result_loaded = False

        if self.enable_cached_results:
            # Get problem key
            cached_test_key = cutlass_bindings.test.conv.host.CreateCachedConv2dTestKey(
                self.conv_kind,
                problem_size,
                alpha,
                beta,
                getTensorView(
                    tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
                ),
                getTensorView(
                    tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
                ),
                getTensorView(
                    tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
                ),
            )

            # The activation is not part of the binding's key, so append its
            # short tag to keep entries with different activations distinct.
            cached_test_key.problem = cached_test_key.problem + f"_{activation.tag.split('::')[-1]}"

            cached_test_result = cutlass_bindings.test.conv.host.CachedTestResult()

            # One cache file per (SM architecture, seed) pair.
            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (
                self.operation.arch,
                self.seed,
            )

            cached_results = cutlass_bindings.test.conv.host.CachedTestResultListing(
                conv2d_result_cache_name
            )
            # CachedTestResultListing cached_results(conv2d_result_cache_name);
            cached = cached_results.find(cached_test_key)
            cached_result_loaded = cached[0]
            if cached_result_loaded:
                cached_test_result = cached[1]

        if not cached_result_loaded:
            # Compute the conv2d on host
            # NOTE(review): D is seeded with ones, not zeros — presumably
            # host_conv2d overwrites every element; confirm.
            tensor_D_ref = np.ones_like(tensor_C)
            tensor_ref_A = getTensorRef(
                tensor_A, self.layout_A, self.conv_kind, problem_size, "a"
            )
            tensor_ref_B = getTensorRef(
                tensor_B, self.layout_B, self.conv_kind, problem_size, "b"
            )
            tensor_ref_C = getTensorRef(
                tensor_C, self.layout_C, self.conv_kind, problem_size, "c"
            )
            tensor_ref_D_ref = getTensorRef(
                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
            )

            # D = alpha * conv(A, B) + beta * C, written through tensor_ref_D_ref.
            self.host_conv2d(
                self.conv_kind,
                problem_size,
                tensor_ref_A,
                tensor_ref_B,
                tensor_ref_C,
                tensor_ref_D_ref,
                alpha,
                beta,
            )

            # Apply the activation on the host result; leaky_relu takes the
            # same 0.5 slope used by the torch reference path.
            if activation == cutlass.backend.epilogue.leaky_relu:
                tensor_D_ref = activation.numpy(tensor_D_ref, 0.5)
            else:
                tensor_D_ref = activation.numpy(tensor_D_ref)

            tensor_view_D_ref = getTensorView(
                tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d"
            )

            if self.enable_cached_results:
                # Store only a hash of D and persist it to the cache file.
                cached_test_result.D = cutlass_bindings.test.conv.host.TensorHash(
                    tensor_view_D_ref
                )
                cached_results = (
                    cutlass_bindings.test.conv.host.CachedTestResultListing(
                        conv2d_result_cache_name
                    )
                )
                cached_results.append(cached_test_key, cached_test_result)
                cached_results.write(conv2d_result_cache_name)
            else:
                return tensor_D_ref

        return cached_test_result.D

    def equal(self, tensor_D, tensor_D_ref, problem_size):
        """Return True when the device result exactly matches the reference."""
        if self.backend == "numpy":
            return super().equal(tensor_D, tensor_D_ref, problem_size)
        else:
            # Make sure the kernel has finished writing D before comparing.
            torch.cuda.synchronize()
            return torch.equal(tensor_D, tensor_D_ref)

    def run(self, ps, split_k_mode=cutlass_bindings.conv.SplitKMode.Serial, split_k_slices=1, alpha=1.0, beta=0.0):
        """Run the plan on problem size ``ps`` and verify against the reference.

        Initializes A/B/C (and a zeroed D) with shapes chosen per conv kind —
        NCHW-shaped for the torch backend (channels-last memory format),
        NHWC for numpy — executes the plan, then returns True iff the device
        result equals the reference.
        """
        #
        # Initialize input and output tensors
        #
        if self.conv_kind == cutlass_bindings.conv.Operator.fprop:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.C, ps.H, ps.W)
                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
                tensor_C_size = (ps.N, ps.K, ps.P, ps.Q)
            else:
                tensor_A_size = (ps.N, ps.H, ps.W, ps.C)
                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
                tensor_C_size = (ps.N, ps.P, ps.Q, ps.K)
        elif self.conv_kind == cutlass_bindings.conv.Operator.dgrad:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
                tensor_B_size = (ps.K, ps.C, ps.R, ps.S)
                tensor_C_size = (ps.N, ps.C, ps.H, ps.W)
            else:
                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
                tensor_B_size = (ps.K, ps.R, ps.S, ps.C)
                tensor_C_size = (ps.N, ps.H, ps.W, ps.C)
        elif self.conv_kind == cutlass_bindings.conv.Operator.wgrad:
            if self.backend == "torch":
                tensor_A_size = (ps.N, ps.K, ps.P, ps.Q)
                tensor_B_size = (ps.N, ps.C, ps.H, ps.W)
                tensor_C_size = (ps.K, ps.C, ps.R, ps.S)
            else:
                tensor_A_size = (ps.N, ps.P, ps.Q, ps.K)
                tensor_B_size = (ps.N, ps.H, ps.W, ps.C)
                tensor_C_size = (ps.K, ps.R, ps.S, ps.C)
        else:
            raise Exception(f"Conv kind {self.conv_kind} is not supported")

        self.set_seed()

        tensor_A = self.uniform_init(size=tensor_A_size, dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=tensor_B_size, dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=tensor_C_size, dtype=self.dtype_C)
        tensor_D = self.zeros_like(tensor_C)

        # Launch the cutlass kernel through the frontend plan.
        self.operation.run(tensor_A, tensor_B, tensor_C, tensor_D,
                           stride=(ps.stride_h, ps.stride_w),
                           padding=(ps.pad_h, ps.pad_w),
                           dilation=(ps.dilation_h, ps.dilation_w),
                           alpha=alpha, beta=beta,
                           split_k=(split_k_mode, split_k_slices))

        tensor_D_ref = self.reference(
            ps, tensor_A, tensor_B, tensor_C, alpha, beta, self.activation
        )

        return self.equal(tensor_D, tensor_D_ref, ps)
def add_test(
    cls,
    cc,
    conv_kind,
    problem_sizes,
    element,
    element_accumulator,
    element_output,
    opclass,
    threadblock_shape,
    warp_count,
    instruction_shape,
    stages,
    iterator_algorithm=None,
    swizzle=None,
    split_k_mode="serial",
    split_k_slices=1,
    activation="identity"
):
    """Create a test-running function with the given specification.

    Builds a unittest-style ``run(self)`` method that constructs a
    ``cutlass.Conv2d`` plan from the given parameters and runs it (via
    ``Conv2dLauncherFrontend``) over every valid entry of ``problem_sizes``
    with alpha=1.0, beta=0.5, asserting each comparison against the
    reference.  The method is attached to ``cls`` under a name derived from
    the parameters, and also returned.

    :param cls: unittest.TestCase subclass to attach the generated test to
    :param cc: compute capability (used only for the generated test name)
    :param conv_kind: convolution kind ("fprop"/"dgrad"/"wgrad" style value,
        forwarded to cutlass.Conv2d and validate_problem_size)
    :param problem_sizes: iterable of Conv2dProblemSize to exercise
    :param activation: epilogue activation name; "leaky_relu" gets a fixed
        0.5 slope
    """
    test_name = get_name_conv2d(
        cc, conv_kind, element, element_accumulator,
        element_output, opclass, threadblock_shape, warp_count, instruction_shape, stages,
        iterator_algorithm, swizzle, split_k_mode, split_k_slices, activation)

    def run(self):
        # Create the plan
        plan = cutlass.Conv2d(
            kind=conv_kind,
            element=element,
            element_accumulator=element_accumulator,
            element_C=element_output,
            element_D=element_output
        )

        # Set the opclass
        plan.opclass = opclass
        # Set the tile description
        td = {
            "threadblock_shape": threadblock_shape,
            "warp_count": warp_count,
            "stages": stages,
            "instruction_shape": instruction_shape,
        }

        plan.tile_description = td
        # Set iterator algorithm
        if iterator_algorithm is not None:
            plan.iterator_algorithm = iterator_algorithm
        # Set swizzling functor
        if swizzle is not None:
            plan.swizzling_stride = swizzle

        if activation != "identity":
            # leaky_relu needs its slope parameter; other activations are
            # looked up by name on cutlass.epilogue.
            if activation == "leaky_relu":
                plan.activation = (cutlass.epilogue.leaky_relu, 0.5)
            else:
                plan.activation = getattr(cutlass.epilogue, activation)

        # NOTE(review): the backend is fixed to "numpy" here even though the
        # launcher also supports "torch" — confirm this is intentional.
        conv2d_launcher = Conv2dLauncherFrontend(plan, 80, backend="numpy")

        for ps in problem_sizes:
            # Skip inconsistent or unsupported problem/split-K combinations.
            if not validate_problem_size(ps, conv_kind, split_k_slices): continue

            self.assertTrue(
                conv2d_launcher.run(ps, split_k_mode, split_k_slices, 1.0, 0.5)
            )

    setattr(cls, test_name, run)

    return run
def get_conv_problems():
    """Return the default conv2d problem sizes plus extra alignment cases."""
    # 64: minimum channel size
    conv_problems = list(cutlass_bindings.test.conv.TestbedConv2dProblemSizes(64).conv2d_default_sizes)

    # Insert alignment 4 & 2 tests: (input NHWC, filter KRSC, padding); all
    # three use stride (3, 3), dilation (1, 1), and cross-correlation mode.
    extra_case_specs = [
        ((1, 4, 4, 12), (8, 3, 3, 12), (0, 0, 0, 0)),
        ((1, 4, 4, 14), (8, 3, 3, 14), (0, 0, 0, 0)),
        ((1, 23, 56, 98), (128, 3, 3, 98), (4, 0, 5, 0)),
    ]
    for input_size, filter_size, padding in extra_case_specs:
        conv_problems.append(
            cutlass_bindings.conv.Conv2dProblemSize(
                cutlass_bindings.Tensor4DCoord(*input_size),
                cutlass_bindings.Tensor4DCoord(*filter_size),
                cutlass_bindings.Tensor4DCoord(*padding),
                cutlass_bindings.MatrixCoord(3, 3),
                cutlass_bindings.MatrixCoord(1, 1),
                cutlass_bindings.conv.Mode.cross_correlation,
                1, 1
            )
        )

    return conv_problems