Rename python/cutlass to python/cutlass_cppgen (#2652)
This commit is contained in:
committed by
Haicheng Wu
parent
4260d4aef9
commit
177a82e251
140
python/cutlass_cppgen/backend/operation.py
Normal file
140
python/cutlass_cppgen/backend/operation.py
Normal file
@ -0,0 +1,140 @@
|
||||
#################################################################################################
|
||||
#
|
||||
# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#################################################################################################
|
||||
|
||||
import ctypes
|
||||
from cutlass_cppgen.utils.lazy_import import lazy_import
|
||||
cuda = lazy_import("cuda.cuda")
|
||||
|
||||
from cutlass_cppgen.backend.utils.device import device_cc
|
||||
|
||||
_supports_cluster_launch = None
|
||||
|
||||
|
||||
def supports_cluster_launch():
|
||||
from cuda import __version__
|
||||
_version_splits = [int(x) for x in __version__.split("rc")[0].split(".post")[0].split(".")]
|
||||
global _supports_cluster_launch
|
||||
if _supports_cluster_launch is None:
|
||||
major, minor = _version_splits[0], _version_splits[1]
|
||||
_supports_cluster_launch = device_cc() in [90, 100, 101, 103] and (major > 11 or (major == 11 and minor >= 8))
|
||||
return _supports_cluster_launch
|
||||
|
||||
|
||||
class LaunchConfiguration:
|
||||
def __init__(self, grid=[1, 1, 1], block=[1, 1, 1], smem=0):
|
||||
self.grid = grid
|
||||
self.block = block
|
||||
self.shared_memory_capacity = smem
|
||||
|
||||
|
||||
class ExecutableOperation:
|
||||
def __init__(self, operation):
|
||||
self.operation = operation
|
||||
self.module = None
|
||||
self.kernel = None
|
||||
|
||||
def name(self):
|
||||
return self.operation.procedural_name()
|
||||
|
||||
def emit(self):
|
||||
return ""
|
||||
|
||||
def can_implement(self, configuration, arguments):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_host_workspace_size(self, arguments):
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_device_workspace_size(self, arguments):
|
||||
raise NotImplementedError()
|
||||
|
||||
def plan(self, arguments):
|
||||
raise NotImplementedError()
|
||||
|
||||
def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=None):
|
||||
raise NotImplementedError()
|
||||
|
||||
def run_with_clusters(self, launch_config, kernel_params, stream=None):
|
||||
if not stream:
|
||||
stream = cuda.CUstream(0)
|
||||
if hasattr(self.operation, "tile_description") and hasattr(self.operation.tile_description, "cluster_shape"):
|
||||
attr = cuda.CUlaunchAttribute()
|
||||
attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.operation.tile_description.cluster_shape
|
||||
attr.id = cuda.CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
|
||||
attrs = [attr]
|
||||
|
||||
# Allow for non-portable cluster sizes
|
||||
err, = cuda.cuFuncSetAttribute(
|
||||
self.kernel, cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
return err
|
||||
else:
|
||||
attrs = []
|
||||
|
||||
config = cuda.CUlaunchConfig()
|
||||
config.gridDimX, config.gridDimY, config.gridDimZ = launch_config.grid
|
||||
config.blockDimX, config.blockDimY, config.blockDimZ = launch_config.block
|
||||
config.blockDimZ = launch_config.block[2]
|
||||
config.sharedMemBytes = launch_config.shared_memory_capacity
|
||||
config.hStream = stream
|
||||
config.attrs = attrs
|
||||
config.numAttrs = len(attrs)
|
||||
|
||||
err, = cuda.cuLaunchKernelEx(
|
||||
config, f=self.kernel, kernelParams=kernel_params, extra=0)
|
||||
return err
|
||||
|
||||
def run_without_clusters(self, launch_config, kernel_params, stream=None):
|
||||
if not stream:
|
||||
stream = cuda.CUstream(0)
|
||||
err, = cuda.cuLaunchKernel(
|
||||
self.kernel,
|
||||
launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
|
||||
launch_config.block[0], launch_config.block[1], launch_config.block[2],
|
||||
launch_config.shared_memory_capacity,
|
||||
stream,
|
||||
kernel_params,
|
||||
0)
|
||||
|
||||
return err
|
||||
|
||||
def run(self, host_workspace, device_workspace, launch_config, stream=None):
|
||||
if not stream:
|
||||
stream = cuda.CUstream(0)
|
||||
cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
|
||||
packed = (ctypes.c_void_p * 1)()
|
||||
packed[0] = ctypes.addressof(cArg)
|
||||
|
||||
if supports_cluster_launch():
|
||||
return self.run_with_clusters(launch_config, packed, stream)
|
||||
else:
|
||||
return self.run_without_clusters(launch_config, packed, stream)
|
||||
Reference in New Issue
Block a user