Rename python/cutlass to python/cutlass_cppgen (#2652)

This commit is contained in:
Jack Kosaian
2025-09-18 13:26:57 -05:00
committed by Haicheng Wu
parent 4260d4aef9
commit 177a82e251
71 changed files with 1 additions and 1 deletions

View File

@@ -0,0 +1,41 @@
#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
from cutlass_cppgen.utils.check import (
alignment_or_default,
calculate_smem_usage,
calculate_smem_usage_per_stage,
valid_cluster_shape,
valid_schedule,
valid_stage_count,
update_alignment,
)
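# Usage sketch (illustrative, not part of this diff): after the rename, downstream
# code imports these helpers from cutlass_cppgen rather than cutlass, e.g.:
#
#   from cutlass_cppgen.utils.check import valid_cluster_shape
#   ok, msg = valid_cluster_shape(90, [2, 1, 1])  # -> (True, "")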

View File

@@ -0,0 +1,262 @@
#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for checking constraints on kernels and calculating kernel attributes
"""
import ctypes
from cutlass_library import DataTypeSize, KernelScheduleSuffixes, OperationKind, SharedMemPerCC
import cutlass_cppgen
from cutlass_cppgen.backend.library import TileDescription
def calculate_smem_usage_per_stage(td: TileDescription, operation_kind: OperationKind) -> int:
"""
Returns the amount of shared memory in bytes consumed in a single stage of a kernel.
:param td: tile description to compute shared memory of
:type td: TileDescription
:param operation_kind: identifier for the type of operation being performed
:type operation_kind: cutlass_library.OperationKind
:return: number of bytes of shared memory consumed by a single stage
:rtype: int
"""
m, n, k = td.blackwell_threadblock_shape
if td.is_2sm:
m //= 2
if operation_kind == OperationKind.Gemm:
stage_barrier_bytes = 32
return (
(DataTypeSize[td.math_instruction.element_a] * m * k // 8)
+ (DataTypeSize[td.math_instruction.element_b] * k * n // 8)
+ stage_barrier_bytes
)
else:
raise Exception(f"No available shared memory calculation for operation kind {operation.operation_kind}")
def calculate_smem_usage(operation) -> int:
"""
Returns the amount of shared memory in bytes consumed by a kernel.
:return: number of bytes of shared memory consumed by the operation
    :rtype: int
"""
_per_stage = calculate_smem_usage_per_stage(operation.tile_description, operation.operation_kind)
return _per_stage * operation.tile_description.stages
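# Worked example (illustrative): for a GEMM stage with a 128x128x64 tile, f16 A and B
# operands (DataTypeSize == 16), and is_2sm False, a single stage consumes
#   A tile: 16 * 128 * 64 / 8 = 16384 bytes
#   B tile: 16 * 64 * 128 / 8 = 16384 bytes
#   stage barrier:                  32 bytes
# for 32800 bytes per stage; with td.stages == 4, calculate_smem_usage returns 131200.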
def valid_stage_count(
cc: int,
kernel_cc: int,
td: TileDescription,
element_C: cutlass_cppgen.DataType = None,
element_D: cutlass_cppgen.DataType = None,
verbose: bool = True) -> tuple:
"""
    Checks whether a device of compute capability `cc` supports the number of stages within `td`,
    both based on raw limits on the number of stages and based on shared memory capacity.
:param cc: compute capability of device in question
:type cc: int
:param kernel_cc: compute capability that the kernel targets (corresponding to the arch::SMxy tag in CUTLASS)
:type kernel_cc: int
:param td: tile description to check
:type td: TileDescription
:param element_C: data type of operand C
:type element_C: cutlass_cppgen.DataType
:param element_D: data type of operand D
:type element_D: cutlass_cppgen.DataType
:param verbose: whether to log warnings
:type verbose: bool
:return: tuple with the first element indicating whether the provided tile description is
valid for the provided device and the second element being an error message
:rtype: tuple
"""
    if kernel_cc in [90, 100, 101, 103]:
        if (td.stages is None or td.stages == 0):
            # A stage count of None or 0 for these architectures indicates that the
            # CollectiveBuilder automatically determines the stage count to use. Thus,
            # all settings are valid in these scenarios.
            return (True, "")
        elif verbose:
            cutlass_cppgen.logger.warning(
                "Setting an explicit stage count for SM90 and newer kernels may currently "
                "result in compilation errors if the combination of tile shape, "
                "stage count, and shared memory requirement of the epilogue exceeds "
                "the available shared memory per SM.")
    if td.stages is None or td.stages <= 0:
return (False, f"Stage counts must be positive integers. Tile description has stage count of {td.stages}.")
if cc < 80 and td.stages != 2:
return (False, f"Tile description has stage count of {td.stages}, "
f"but only 2 stages are supported on SM{cc}.")
# The calculation below does not consider shared memory used by the epilogue and, thus,
# only catches cases in which the mainloop exceeds the device's shared memory capacity.
# This is not a concern for CUTLASS 2.x kernels, for which the shared memory of the
# mainloop and epilogue is shared.
smem_per_stage = calculate_smem_usage_per_stage(td, OperationKind.Gemm)
smem_usage_mainloop = (smem_per_stage * td.stages)
smem_arch = SharedMemPerCC[cc] << 10
if smem_usage_mainloop > smem_arch:
        return (False,
                "Configuration uses too much shared memory. Consider reducing stage count or tile shape.\n"
                f"Details:\n"
                f"Mainloop uses {smem_per_stage} bytes of shared memory per stage, and "
                f"{td.stages} stages for a total of {smem_usage_mainloop} bytes.\n"
                f"The maximum amount of shared memory that can be used per block on CC {cc} is {smem_arch}.")
return (True, "")
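# Usage sketch (illustrative; TileDescription construction elided). The helper returns a
# (valid, error_message) pair rather than raising, so callers branch on the first element:
#
#   valid, err = valid_stage_count(cc=80, kernel_cc=80, td=td)
#   if not valid:
#       raise Exception(err)
#
# For SM90-and-newer kernel targets, td.stages = None defers the stage count to the
# CollectiveBuilder and always validates.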
def valid_cluster_shape(cc: int, cluster_shape: list) -> tuple:
"""
Checks whether a device with `cc` supports a thread block cluster of shape `cluster_shape`.
:param cc: compute capability of device in question
:type cc: int
:param cluster_shape: dimensions of thread block cluster shape to check
:type cluster_shape: list
:return: tuple with the first element indicating whether the provided cluster shape is
valid for the provided device and the second element being an error message
:rtype: tuple
"""
if cc < 90 or cc in [120, 121]:
if cluster_shape != [1, 1, 1]:
return (False,
f"Cluster shape for pre-SM90 architectures and SM 120 and 121 must be [1, 1, 1]. Received cluster shape of "
f"{cluster_shape} for SM{cc}.")
else:
return (True, "")
    if len(cluster_shape) != 3:
        return (False,
                f"Cluster shapes must be rank-3. Received {cluster_shape} (rank {len(cluster_shape)}).")
if cluster_shape[2] != 1:
return (False,
"CUTLASS kernels currently require the third dimension of cluster shape to be 1. "
f"Received cluster shape of {cluster_shape}.")
return (True, "")
def valid_schedule(
cc: int,
kernel_schedule: cutlass_cppgen.KernelScheduleType,
epilogue_schedule: cutlass_cppgen.EpilogueScheduleType,
tile_scheduler: cutlass_cppgen.TileSchedulerType) -> tuple:
"""
Checks that the kernel and epilogue schedules passed in are a valid combination for
a device of compute capability ``cc``.
:param cc: compute capability of device in question
:type cc: int
:param kernel_schedule: kernel schedule type
:type kernel_schedule: cutlass_cppgen.KernelScheduleType
:param epilogue_schedule: epilogue schedule type
:type epilogue_schedule: cutlass_cppgen.EpilogueScheduleType
:param tile_scheduler: tile scheduler type
:type tile_scheduler: cutlass_cppgen.TileSchedulerType
:return: tuple with the first element indicating whether the provided schedules are
valid for the provided device and the second element being an error message
:rtype: tuple
"""
kernel_auto = (kernel_schedule == cutlass_cppgen.KernelScheduleType.ScheduleAuto)
epilogue_auto = (epilogue_schedule == cutlass_cppgen.EpilogueScheduleType.ScheduleAuto)
tile_scheduler_default = (tile_scheduler == cutlass_cppgen.TileSchedulerType.Default)
if (cc < 90 or cc in [120, 121]) and not (kernel_auto and epilogue_auto and tile_scheduler_default):
return (False, "Non-default schedules are only supported on SM90 and beyond (excluding SM120 and SM121)")
if cc == 90 and ((kernel_auto and not epilogue_auto) or (not kernel_auto and epilogue_auto)):
return (False, "Kernel and epilogue schedules must either both be auto or neither be auto")
if not tile_scheduler_default:
cooperative_kernels = [cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative,
cutlass_cppgen.KernelScheduleType.CpAsyncWarpSpecializedCooperative]
if cc == 90 and (tile_scheduler == cutlass_cppgen.TileSchedulerType.StreamK) and (kernel_schedule not in cooperative_kernels):
return (False, "Stream-K tile scheduler is currently only supported with the cooperative kernel schedule")
return (True, "")
def alignment_or_default(alignment_provided: int, default_alignment: int) -> int:
"""
    Returns `alignment_provided` if it is set; otherwise returns `default_alignment`. Raises an
    exception if `alignment_provided` exceeds `default_alignment`.
:param alignment_provided: alignment preference specified. Can be None.
:type alignment_provided: int
:param default_alignment: alignment to use if `alignment_provided` is None
:type default_alignment: int
:return: alignment to use
:rtype: int
"""
if alignment_provided is not None:
if alignment_provided > default_alignment:
raise Exception(f"Alignment {alignment_provided} exceeds the maximum supported of {default_alignment}.")
return alignment_provided
return default_alignment
def update_alignment(alignment_provided:int, default_alignment: int) -> int:
"""
    Returns `alignment_provided` if it is set; otherwise returns `default_alignment`. If
    `alignment_provided` exceeds `default_alignment` but is a multiple of it, `default_alignment`
    is returned; otherwise, an exception is raised.
:param alignment_provided: alignment preference specified. Can be None.
:type alignment_provided: int
:param default_alignment: alignment to use if `alignment_provided` is None
:type default_alignment: int
:return: alignment to use
:rtype: int
"""
if alignment_provided is not None:
if alignment_provided > default_alignment:
if alignment_provided % default_alignment == 0:
return default_alignment
raise Exception(f"Alignment {alignment_provided} exceeds the maximum supported of {default_alignment}.")
return alignment_provided
return default_alignment
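# Illustrative contrast between the two helpers for default_alignment = 8:
#
#   alignment_or_default(None, 8)  # -> 8
#   alignment_or_default(4, 8)     # -> 4
#   alignment_or_default(16, 8)    # raises: 16 exceeds the maximum of 8
#   update_alignment(16, 8)        # -> 8 (16 is a multiple of 8, so clamp to 8)
#   update_alignment(12, 8)        # raises: 12 is not a multiple of 8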

View File

@@ -0,0 +1,362 @@
#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Utility functions for converting between frontend datatypes and CUTLASS datatypes
"""
import cutlass_cppgen
from cutlass_library import (
DataTypeSize,
MathOperation,
MathInstruction
)
from cutlass_cppgen.backend.library import (
TileDescription,
)
bfloat16_available = None
cupy_available = None
numpy_available = None
torch_available = None
_library_to_cupy_dict = None
_library_to_numpy_dict = None
_library_to_torch_dict = None
_torch_to_library_dict = None
def is_numpy_available():
global numpy_available, _library_to_numpy_dict
if numpy_available is None:
try:
import numpy as np
numpy_available = True
_library_to_numpy_dict = {
cutlass_cppgen.DataType.f16: np.float16,
cutlass_cppgen.DataType.f32: np.float32,
cutlass_cppgen.DataType.f64: np.float64,
cutlass_cppgen.DataType.s8: np.int8,
cutlass_cppgen.DataType.s32: np.int32,
}
except ImportError:
numpy_available = False
_library_to_numpy_dict = {}
return numpy_available
def is_numpy_tensor(inp) -> bool:
if is_numpy_available():
import numpy as np
return isinstance(inp, np.ndarray)
return False
def numpy_library_type(inp) -> cutlass_cppgen.DataType:
if is_numpy_available():
import numpy as np
if inp == np.float16:
return cutlass_cppgen.DataType.f16
elif inp == np.float32:
return cutlass_cppgen.DataType.f32
elif inp == np.float64:
return cutlass_cppgen.DataType.f64
elif inp == np.int8:
return cutlass_cppgen.DataType.s8
elif inp == np.int32:
return cutlass_cppgen.DataType.s32
return None
def numpy_type(inp):
    is_numpy_available()  # ensure the library-to-numpy mapping is initialized
    return _library_to_numpy_dict.get(inp, None)
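# Round-trip sketch (illustrative): the mapping is populated lazily, after which the
# two conversions are inverses for the supported types:
#
#   import numpy as np
#   numpy_library_type(np.float32)           # -> cutlass_cppgen.DataType.f32
#   numpy_type(cutlass_cppgen.DataType.f32)  # -> np.float32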
def is_cupy_available():
    # Without the global declaration for the mapping, the assignments below would bind
    # a local variable and leave the module-level _library_to_cupy_dict as None.
    global cupy_available, _library_to_cupy_dict
if cupy_available is None:
try:
import cupy as cp
cupy_available = True
_library_to_cupy_dict = {
cutlass_cppgen.DataType.f16: cp.float16,
cutlass_cppgen.DataType.f32: cp.float32,
cutlass_cppgen.DataType.f64: cp.float64,
cutlass_cppgen.DataType.s8: cp.int8,
cutlass_cppgen.DataType.s32: cp.int32,
}
except ImportError:
cupy_available = False
_library_to_cupy_dict = {}
return cupy_available
def is_cupy_tensor(inp) -> bool:
if is_cupy_available():
import cupy as cp
return isinstance(inp, cp.ndarray)
return False
def cupy_library_type(inp) -> cutlass_cppgen.DataType:
if is_cupy_available():
import cupy as cp
if inp == cp.float16:
return cutlass_cppgen.DataType.f16
elif inp == cp.float32:
return cutlass_cppgen.DataType.f32
elif inp == cp.float64:
return cutlass_cppgen.DataType.f64
return None
def cupy_type(inp):
    is_cupy_available()  # ensure the library-to-cupy mapping is initialized
    return _library_to_cupy_dict.get(inp, None)
def is_torch_available():
global torch_available, _library_to_torch_dict, _torch_to_library_dict
if torch_available is None:
try:
import torch
torch_available = True
            _torch_to_library_dict = {
                # torch.half, torch.float, and torch.double are aliases of torch.float16,
                # torch.float32, and torch.float64, so only the canonical names are listed.
                torch.float16: cutlass_cppgen.DataType.f16,
                torch.bfloat16: cutlass_cppgen.DataType.bf16,
                torch.float32: cutlass_cppgen.DataType.f32,
                torch.float64: cutlass_cppgen.DataType.f64,
                torch.int8: cutlass_cppgen.DataType.s8,
                torch.int32: cutlass_cppgen.DataType.s32,
                torch.uint8: cutlass_cppgen.DataType.u8,
            }
            _library_to_torch_dict = {
                cutlass_cppgen.DataType.f16: torch.float16,
                cutlass_cppgen.DataType.bf16: torch.bfloat16,
                cutlass_cppgen.DataType.f32: torch.float32,
                cutlass_cppgen.DataType.f64: torch.float64,
                cutlass_cppgen.DataType.s8: torch.int8,
                cutlass_cppgen.DataType.s32: torch.int32,
                cutlass_cppgen.DataType.u8: torch.uint8,
            }
def possibly_add_type(torch_type_name, cutlass_type):
# Only try adding the type if the version of torch being used supports it
if hasattr(torch, torch_type_name):
torch_type = getattr(torch, torch_type_name)
_torch_to_library_dict[torch_type] = cutlass_type
_library_to_torch_dict[cutlass_type] = torch_type
possibly_add_type("float8_e4m3fn", cutlass_cppgen.DataType.e4m3)
possibly_add_type("float8_e5m2", cutlass_cppgen.DataType.e5m2)
except ImportError:
torch_available = False
_torch_to_library_dict = {}
_library_to_torch_dict = {}
return torch_available
def is_torch_tensor(inp) -> bool:
if is_torch_available():
import torch
return isinstance(inp, torch.Tensor)
return False
def torch_library_type(inp) -> cutlass_cppgen.DataType:
    is_torch_available()  # ensure the dtype mappings are initialized
    return _torch_to_library_dict.get(inp, None)
def torch_type(inp):
    is_torch_available()  # ensure the dtype mappings are initialized
    return _library_to_torch_dict.get(inp, None)
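# Illustrative: float8 conversions are registered only when the installed torch build
# exposes the dtypes, so the result can be None on older versions:
#
#   if is_torch_available():
#       import torch
#       torch_library_type(torch.float16)         # -> cutlass_cppgen.DataType.f16
#       torch_type(cutlass_cppgen.DataType.e4m3)  # -> torch.float8_e4m3fn, or None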
def is_bfloat16_available():
global bfloat16_available
if bfloat16_available is None:
try:
import bfloat16
bfloat16_available = True
except ImportError:
bfloat16_available = False
return bfloat16_available
def bfloat16_library_type(inp) -> cutlass_cppgen.DataType:
if is_bfloat16_available():
import bfloat16
if inp == bfloat16.bfloat16:
return cutlass_cppgen.DataType.bf16
def bfloat16_type(inp):
if is_bfloat16_available():
import bfloat16
if inp == cutlass_cppgen.DataType.bf16:
return bfloat16.bfloat16
def library_type(inp):
if inp in DataTypeSize:
return inp
for cvt_fn in [
bfloat16_library_type,
cupy_library_type,
numpy_library_type,
torch_library_type,
]:
out = cvt_fn(inp)
if out is not None:
return out
raise Exception(f"No available conversion from type {inp} to a library type.")
def _tensor_from_numpy(np_tensor):
    dtype = library_type(np_tensor.dtype)
    if np_tensor.flags.c_contiguous:
        layout = cutlass_cppgen.LayoutType.RowMajor
    elif np_tensor.flags.f_contiguous:
        layout = cutlass_cppgen.LayoutType.ColumnMajor
    else:
        raise Exception("Unable to infer layout for a tensor that is neither C- nor F-contiguous.")
    return (dtype, layout)
def _tensor_from_torch(pt_tensor):
dtype = library_type(pt_tensor.dtype)
return (dtype, cutlass_cppgen.LayoutType.RowMajor)
def get_datatype_and_layout(tensor):
if (is_numpy_tensor(tensor) or is_cupy_tensor(tensor)):
return _tensor_from_numpy(tensor)
elif is_torch_tensor(tensor):
return _tensor_from_torch(tensor)
elif isinstance(tensor, float) or isinstance(tensor, int):
return (cutlass_cppgen.DataType.f32, cutlass_cppgen.LayoutType.RowMajor)
else:
raise Exception(f"Unable to convert tensor of type {type(tensor)} to Python-bound CUTLASS datatype and layout.")
def get_tensor_shape(tensor, op="GEMM"):
if (is_numpy_tensor(tensor) or is_cupy_tensor(tensor)):
return tensor.shape
elif is_torch_tensor(tensor):
size = tensor.size()
if op == "CONV":
# PyTorch Tensors have shape NCHW
return (size[0], size[2], size[3], size[1])
else:
return tuple(tensor.size())
elif isinstance(tensor, float) or isinstance(tensor, int):
return (1,)
else:
raise Exception(f"Unable to convert tensor of type {type(tensor)} to Python-bound CUTLASS datatype and layout.")
_math_operation_value_map = {x.value: x for x in MathOperation}
def backend_math_operation(math_op: MathOperation):
if math_op.value not in _math_operation_value_map.keys():
raise Exception(f"Unable to convert math operation of type {math_op} to backend math operation.")
return _math_operation_value_map[math_op.value]
def construct_backend_td(td: cutlass_cppgen.TileDescription,
kernel_schedule: cutlass_cppgen.KernelScheduleType,
epilogue_schedule: cutlass_cppgen.EpilogueScheduleType,
tile_scheduler: cutlass_cppgen.TileSchedulerType) -> TileDescription:
mi = td.math_instruction
backend_mi = MathInstruction(
mi.instruction_shape,
mi.element_a,
mi.element_b,
mi.element_accumulator,
mi.opcode_class,
backend_math_operation(mi.math_operation)
)
cluster_shape = td.cluster_shape if hasattr(td, "cluster_shape") else [1, 1, 1]
return TileDescription(td.threadblock_shape, td.stages, td.warp_count,
backend_mi, cluster_shape, kernel_schedule, epilogue_schedule, tile_scheduler)
def td_from_profiler_op(op) -> TileDescription:
"""
Converts the profiler's TileDescription in ``op`` into the backend TileDescription
:param op: profiler Operation
:returns: backend TileDescription
:rtype: cutlass_cppgen.backend.TileDescription
"""
kschedule = op.kernel_schedule if hasattr(op, 'kernel_schedule') else None
eschedule = op.epilogue_schedule if hasattr(op, 'epilogue_schedule') else None
tschedule = op.tile_scheduler if hasattr(op, 'tile_scheduler') else None
return construct_backend_td(op.tile_description, kschedule, eschedule, tschedule)
def td_from_profiler_td(td: TileDescription) -> TileDescription:
"""
Converts the profiler's TileDescription into the backend TileDescription
:param td: profiler TileDescription
:type td: cutlass_cppgen.TileDescription
:returns: backend TileDescription
:rtype: cutlass_cppgen.backend.TileDescription
"""
return construct_backend_td(td, kernel_schedule=None, epilogue_schedule=None, tile_scheduler=None)
def to_camel_case(snake_str):
return "".join(x.capitalize() for x in snake_str.lower().split("_"))
def getattr_enum(obj, attr_name):
    # attr_name is given in snake_case; enum members use CamelCase
camel_attr = to_camel_case(attr_name)
if hasattr(obj, camel_attr):
return getattr(obj, camel_attr)
else:
raise Exception(f"Invalid option: {attr_name}")

View File

@@ -0,0 +1,41 @@
#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import importlib
from typing import Any
def lazy_import(mod_name: str) -> Any:
    class Lazy:
        def __getattr__(self, name: str) -> Any:
            # Defer the import until an attribute of the module is first accessed
            module = importlib.import_module(mod_name)
            return getattr(module, name)
    return Lazy()
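# Usage sketch (illustrative): the proxy defers importing the target module, so merely
# importing this file does not require the target package to be installed:
#
#   cuda = lazy_import("cuda.cuda")  # nothing is imported yet
#   (err,) = cuda.cuInit(0)          # "cuda.cuda" is imported on first access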

View File

@@ -0,0 +1,196 @@
#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Profiler based on CUDA events
"""
import re
import subprocess
from cutlass_cppgen.utils.lazy_import import lazy_import
cuda = lazy_import("cuda.cuda")
cudart = lazy_import("cuda.cudart")
import numpy as np
from cutlass_cppgen import CUTLASS_PATH
from cutlass_cppgen.backend.library import DataTypeSize
from cutlass_cppgen.op.op import OperationBase
from cutlass_cppgen.shape import GemmCoord
from cutlass_cppgen.utils.datatypes import is_numpy_tensor
class GpuTimer:
def __init__(self) -> None:
self.events = [
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
]
def start(self, stream=None):
if not stream:
stream = cuda.CUstream(0)
(err,) = cuda.cuEventRecord(self.events[0], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
    def stop(self, stream=None):
        if not stream:
            stream = cuda.CUstream(0)
        (err,) = cuda.cuEventRecord(self.events[1], stream)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError(f"CUDA Error {str(err)}")
    def stop_and_wait(self, stream=None):
        self.stop(stream)
        if stream:
            (err,) = cuda.cuStreamSynchronize(stream)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError(f"CUDA Error {str(err)}")
        else:
            (err,) = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
                raise RuntimeError(f"CUDA Error {str(err)}")
def duration(self, iterations=1):
err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
return duration / float(iterations)
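# Usage sketch (illustrative, assuming `kernel` is a callable that enqueues work on
# the default stream):
#
#   timer = GpuTimer()
#   timer.start()
#   for _ in range(100):
#       kernel()
#   timer.stop_and_wait()
#   ms_per_iteration = timer.duration(iterations=100)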
class CUDAEventProfiler:
def __init__(self, op: OperationBase, warmup_iterations: int=500, iterations: int=500, *args, **kwargs) -> None:
self.arguments = op.run(*args, **kwargs)
self.operation = op.operation
self.warmup_iterations = warmup_iterations
self.iterations = iterations
self.timer = GpuTimer()
#
# Cutlass Python Interface Profiler
#
def __call__(self):
for _ in range(self.warmup_iterations):
self.operation.run(self.arguments)
self.timer.start()
for _ in range(self.iterations):
self.operation.run(self.arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
return runtime
#
# CUTLASS Profiler
#
def run_cutlass_profiler(self):
alpha = 1.0
beta = 1.0
profiler_path = CUTLASS_PATH + "/build/tools/profiler/cutlass_profiler"
kernel_name = self.operation.procedural_name()
verification_providers = "device"
provider = "cutlass"
problem_size = self.arguments.problem_size
if "cutlass3x" in kernel_name:
# cutlass3x generator only have column-major output
layout_name = self.operation.layout_name_3x()
if layout_name[-1] == "t":
new_layout_name = "".join(["n" for l in layout_name if l == "t" or "t"])
problem_size = GemmCoord(problem_size.n, problem_size.m, problem_size.k)
kernel_name = kernel_name.replace(layout_name, new_layout_name)
batch_count = self.arguments.batch_count
cmd = f"{profiler_path} --kernels={kernel_name} --verification-providers={verification_providers} " \
f"--providers={provider} --m={problem_size.m()} --n={problem_size.n()} --k={problem_size.k()} " \
f"--batch_count={batch_count} --alpha={alpha} --beta={beta} "\
f"--warmup-iterations={self.warmup_iterations} --profiling-iterations={self.iterations}"
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group("runtime"))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group("bytes"))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group("flops"))
        # check that the profiler's reported bytes and FLOPs match our analytical model
assert bytes == self.bytes(problem_size, batch_count, beta)
assert flops == self.flops(problem_size, batch_count, beta)
return runtime
def bytes(self, problem_size, batch_count=1, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
bytes = (
(DataTypeSize[self.operation.A.element] * m // 8) * k
+ (DataTypeSize[self.operation.B.element] * n // 8) * k
+ (DataTypeSize[self.operation.C.element] * m // 8) * n
)
if beta != 0:
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
bytes *= batch_count
return bytes
def flops(self, problem_size, batch_count=1, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
flops_ = (m * n * k) * 2 * batch_count
if beta != 0:
flops_ += m * n * batch_count * 2
return flops_
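# Worked example (illustrative): a 4096x4096x4096 GEMM with batch_count=1 and beta=1.0
# models
#   flops: 2 * 4096^3 + 2 * 4096^2 = 137438953472 + 33554432
# and, for f16 A/B and f32 C (2- and 4-byte elements),
#   bytes: 2*4096*4096 (A) + 2*4096*4096 (B) + 4*4096*4096 (D write)
#          + 4*4096*4096 (C read, since beta != 0) = 201326592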