Rename python/cutlass to python/cutlass_cppgen (#2652)

2025-09-18 13:26:57 -05:00
parent 4260d4aef9
commit 177a82e251
71 changed files with 1 additions and 1 deletions
--- a/python/cutlass_cppgen/backend/operation.py
+++ b/python/cutlass_cppgen/backend/operation.py
@@ -0,0 +1,140 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import ctypes
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+
+from cutlass_cppgen.backend.utils.device import device_cc
+
+# Cached result of supports_cluster_launch(); None means "not computed yet".
+_supports_cluster_launch = None
+
+
+def supports_cluster_launch():
+    """
+    Return whether kernels should be launched through the cluster-aware path.
+
+    The result is truthy only when ``device_cc()`` reports 90, 100, 101 or 103
+    and the installed ``cuda`` Python package is version 11.8 or newer. It is
+    computed on the first call and cached in ``_supports_cluster_launch``, so
+    later calls neither re-import ``cuda`` nor re-parse its version string.
+
+    :return: the cached support flag
+    """
+    global _supports_cluster_launch
+    if _supports_cluster_launch is None:
+        # Imported here so the package is only touched when the answer is needed.
+        from cuda import __version__
+
+        # Strip any "rc..." / ".post..." suffix, then read only major and minor:
+        # later components are irrelevant to the 11.8 threshold.
+        release = __version__.split("rc")[0].split(".post")[0]
+        components = release.split(".")
+        major, minor = int(components[0]), int(components[1])
+        _supports_cluster_launch = device_cc() in [90, 100, 101, 103] and (major > 11 or (major == 11 and minor >= 8))
+    return _supports_cluster_launch
+
+
+class LaunchConfiguration:
+    """
+    Grid dimensions, block dimensions and shared-memory size for a kernel launch.
+    """
+
+    def __init__(self, grid=None, block=None, smem=0):
+        """
+        :param grid: grid dimensions; a fresh ``[1, 1, 1]`` is used when omitted
+        :param block: block dimensions; a fresh ``[1, 1, 1]`` is used when omitted
+        :param smem: shared-memory size, stored as ``shared_memory_capacity``
+        """
+        # A list literal as the default would be one object shared by every
+        # instance, so mutating one configuration would silently change the
+        # others. Build a new list per instance instead. Lists supplied by the
+        # caller are still stored by reference, as before.
+        self.grid = [1, 1, 1] if grid is None else grid
+        self.block = [1, 1, 1] if block is None else block
+        self.shared_memory_capacity = smem
+
+
+class ExecutableOperation:
+    """
+    Base class that pairs an operation description with a kernel and launches it.
+
+    ``can_implement``, ``get_host_workspace_size``, ``get_device_workspace_size``,
+    ``plan`` and ``initialize`` raise ``NotImplementedError`` here. ``run`` and
+    its two launch helpers are implemented and launch ``self.kernel``.
+    """
+
+    def __init__(self, operation):
+        """
+        :param operation: operation description; ``name`` calls its
+            ``procedural_name()`` method
+        """
+        self.operation = operation
+        # Neither attribute is assigned anywhere else in this class;
+        # ``self.kernel`` must be set before ``run`` is called.
+        self.module = None
+        self.kernel = None
+
+    def name(self):
+        """Return ``self.operation.procedural_name()``."""
+        return self.operation.procedural_name()
+
+    def emit(self):
+        """Return an empty string (presumably overridden by subclasses -- confirm)."""
+        return ""
+
+    def can_implement(self, configuration, arguments):
+        """Not implemented in the base class."""
+        raise NotImplementedError()
+
+    def get_host_workspace_size(self, arguments):
+        """Not implemented in the base class."""
+        raise NotImplementedError()
+
+    def get_device_workspace_size(self, arguments):
+        """Not implemented in the base class."""
+        raise NotImplementedError()
+
+    def plan(self, arguments):
+        """Not implemented in the base class."""
+        raise NotImplementedError()
+
+    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=None):
+        """Not implemented in the base class."""
+        raise NotImplementedError()
+
+    def run_with_clusters(self, launch_config, kernel_params, stream=None):
+        """
+        Launch ``self.kernel`` through ``cuLaunchKernelEx``.
+
+        When the operation has a ``tile_description`` carrying a
+        ``cluster_shape``, that shape is attached as the cluster-dimension launch
+        attribute and non-portable cluster sizes are enabled on the kernel first.
+        Otherwise the kernel is launched with no launch attributes.
+
+        :param launch_config: object providing ``grid`` and ``block`` (three
+            entries each) and ``shared_memory_capacity``
+        :param kernel_params: forwarded unchanged as ``kernelParams``
+        :param stream: stream to launch on; a falsy value selects ``cuda.CUstream(0)``
+        :return: the result code of the launch, or of ``cuFuncSetAttribute`` if
+            that call did not succeed (in which case nothing is launched)
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        if hasattr(self.operation, "tile_description") and hasattr(self.operation.tile_description, "cluster_shape"):
+            attr = cuda.CUlaunchAttribute()
+            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.operation.tile_description.cluster_shape
+            attr.id = cuda.CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+            attrs = [attr]
+
+            # Allow for non-portable cluster sizes
+            err, = cuda.cuFuncSetAttribute(
+                self.kernel, cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                return err
+        else:
+            attrs = []
+
+        config = cuda.CUlaunchConfig()
+        config.gridDimX, config.gridDimY, config.gridDimZ = launch_config.grid
+        # The unpacking below sets all three block dimensions, including Z.
+        config.blockDimX, config.blockDimY, config.blockDimZ = launch_config.block
+        config.sharedMemBytes = launch_config.shared_memory_capacity
+        config.hStream = stream
+        config.attrs = attrs
+        config.numAttrs = len(attrs)
+
+        err, = cuda.cuLaunchKernelEx(
+            config, f=self.kernel, kernelParams=kernel_params, extra=0)
+        return err
+
+    def run_without_clusters(self, launch_config, kernel_params, stream=None):
+        """
+        Launch ``self.kernel`` through ``cuLaunchKernel``.
+
+        :param launch_config: object providing ``grid`` and ``block`` (indexed
+            0..2) and ``shared_memory_capacity``
+        :param kernel_params: forwarded unchanged as the kernel parameters
+        :param stream: stream to launch on; a falsy value selects ``cuda.CUstream(0)``
+        :return: the result code returned by ``cuLaunchKernel``
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        err, = cuda.cuLaunchKernel(
+            self.kernel,
+            launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
+            launch_config.block[0], launch_config.block[1], launch_config.block[2],
+            launch_config.shared_memory_capacity,
+            stream,
+            kernel_params,
+            0)
+
+        return err
+
+    def run(self, host_workspace, device_workspace, launch_config, stream=None):
+        """
+        Launch the kernel with ``host_workspace`` as its single parameter.
+
+        Dispatches to ``run_with_clusters`` when ``supports_cluster_launch()`` is
+        truthy and to ``run_without_clusters`` otherwise.
+
+        :param host_workspace: writable buffer (``from_buffer`` rejects read-only
+            objects) holding the bytes passed to the kernel
+        :param device_workspace: not used by this method
+        :param launch_config: launch dimensions, forwarded to the launch helper
+        :param stream: stream to launch on; a falsy value selects ``cuda.CUstream(0)``
+        :return: the result code returned by the selected launch helper
+        """
+        if not stream:
+            stream = cuda.CUstream(0)
+        # View the workspace as a ctypes char array without copying it, then
+        # build a one-entry void* array whose entry is the address of those bytes.
+        cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
+        packed = (ctypes.c_void_p * 1)()
+        packed[0] = ctypes.addressof(cArg)
+
+        if supports_cluster_launch():
+            return self.run_with_clusters(launch_config, packed, stream)
+        else:
+            return self.run_without_clusters(launch_config, packed, stream)