################################################################################################# # # Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-3-Clause # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # 3. Neither the name of the copyright holder nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################################# """ Profiler based on the cuda events """ import re import subprocess from cutlass_cppgen.utils.lazy_import import lazy_import cuda = lazy_import("cuda.cuda") cudart = lazy_import("cuda.cudart") import numpy as np from cutlass_cppgen import CUTLASS_PATH from cutlass_cppgen.backend.library import DataTypeSize from cutlass_cppgen.op.op import OperationBase from cutlass_cppgen.shape import GemmCoord from cutlass_cppgen.utils.datatypes import is_numpy_tensor class GpuTimer: def __init__(self) -> None: self.events = [ cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1], cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1], ] def start(self, stream=None): if not stream: stream = cuda.CUstream(0) (err,) = cuda.cuEventRecord(self.events[0], stream) if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"CUDA Error {str(err)}") def stop(self, stream=None): if not stream: stream = cuda.CUstream(0) (err,) = cuda.cuEventRecord(self.events[1], stream) if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"CUDA Error {str(err)}") pass def stop_and_wait(self, stream=None): if not stream: stream = cuda.CUstream(0) self.stop(stream) if stream: (err,) = cuda.cuStreamSynchronize(stream) if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"CUDA Error {str(err)}") else: (err,) = cudart.cudaDeviceSynchronize() if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"CUDA Error {str(err)}") def duration(self, iterations=1): err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1]) if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError(f"CUDA Error {str(err)}") return duration / float(iterations) class CUDAEventProfiler: def __init__(self, op: OperationBase, warmup_iterations: int=500, iterations: int=500, *args, **kwargs) -> None: self.arguments = op.run(*args, **kwargs) self.operation = op.operation self.warmup_iterations = warmup_iterations self.iterations = iterations self.timer = GpuTimer() # # Cutlass Python Interface Profiler # def __call__(self): for _ in range(self.warmup_iterations): self.operation.run(self.arguments) self.timer.start() for _ in range(self.iterations): self.operation.run(self.arguments) self.timer.stop_and_wait() runtime = self.timer.duration(self.iterations) return runtime # # CUTLASS Profiler # def run_cutlass_profiler(self): alpha = 1.0 beta = 1.0 profiler_path = CUTLASS_PATH + "/build/tools/profiler/cutlass_profiler" kernel_name = self.operation.procedural_name() verification_providers = "device" provider = "cutlass" problem_size = self.arguments.problem_size if "cutlass3x" in kernel_name: # cutlass3x generator only have column-major output layout_name = self.operation.layout_name_3x() if layout_name[-1] == "t": new_layout_name = "".join(["n" for l in layout_name if l == "t" or "t"]) problem_size = GemmCoord(problem_size.n, problem_size.m, problem_size.k) kernel_name = kernel_name.replace(layout_name, new_layout_name) batch_count = self.arguments.batch_count cmd = f"{profiler_path} --kernels={kernel_name} --verification-providers={verification_providers} " \ f"--providers={provider} --m={problem_size.m()} --n={problem_size.n()} --k={problem_size.k()} " \ f"--batch_count={batch_count} --alpha={alpha} --beta={beta} "\ f"--warmup-iterations={self.warmup_iterations} --profiling-iterations={self.iterations}" result = subprocess.getoutput(cmd) m = re.search(r"Runtime:\s+(?P\d+.\d+)", result) runtime = float(m.group("runtime")) m = re.search(r"Bytes:\s+(?P\d+)", result) bytes = int(m.group("bytes")) m = re.search(r"FLOPs:\s+(?P\d+)", result) flops = int(m.group("flops")) # check if the problem size matches assert bytes == self.bytes(problem_size, batch_count, beta) assert flops == self.flops(problem_size, batch_count, beta) return runtime def bytes(self, problem_size, batch_count=1, beta=0.0): m = problem_size.m() n = problem_size.n() k = problem_size.k() bytes = ( (DataTypeSize[self.operation.A.element] * m // 8) * k + (DataTypeSize[self.operation.B.element] * n // 8) * k + (DataTypeSize[self.operation.C.element] * m // 8) * n ) if beta != 0: bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n bytes *= batch_count return bytes def flops(self, problem_size, batch_count=1, beta=0.0): m = problem_size.m() n = problem_size.n() k = problem_size.k() flops_ = (m * n * k) * 2 * batch_count if beta != 0: flops_ += m * n * batch_count * 2 return flops_