Files
cutlass/python/cutlass_cppgen/utils/profiler.py
2025-09-18 14:26:57 -04:00

197 lines
7.0 KiB
Python

#################################################################################################
#
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
"""
Profiler based on the cuda events
"""
import re
import subprocess
from cutlass_cppgen.utils.lazy_import import lazy_import
cuda = lazy_import("cuda.cuda")
cudart = lazy_import("cuda.cudart")
import numpy as np
from cutlass_cppgen import CUTLASS_PATH
from cutlass_cppgen.backend.library import DataTypeSize
from cutlass_cppgen.op.op import OperationBase
from cutlass_cppgen.shape import GemmCoord
from cutlass_cppgen.utils.datatypes import is_numpy_tensor
class GpuTimer:
def __init__(self) -> None:
self.events = [
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
]
def start(self, stream=None):
if not stream:
stream = cuda.CUstream(0)
(err,) = cuda.cuEventRecord(self.events[0], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
def stop(self, stream=None):
if not stream:
stream = cuda.CUstream(0)
(err,) = cuda.cuEventRecord(self.events[1], stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
pass
def stop_and_wait(self, stream=None):
if not stream:
stream = cuda.CUstream(0)
self.stop(stream)
if stream:
(err,) = cuda.cuStreamSynchronize(stream)
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
else:
(err,) = cudart.cudaDeviceSynchronize()
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
def duration(self, iterations=1):
err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
if err != cuda.CUresult.CUDA_SUCCESS:
raise RuntimeError(f"CUDA Error {str(err)}")
return duration / float(iterations)
class CUDAEventProfiler:
def __init__(self, op: OperationBase, warmup_iterations: int=500, iterations: int=500, *args, **kwargs) -> None:
self.arguments = op.run(*args, **kwargs)
self.operation = op.operation
self.warmup_iterations = warmup_iterations
self.iterations = iterations
self.timer = GpuTimer()
#
# Cutlass Python Interface Profiler
#
def __call__(self):
for _ in range(self.warmup_iterations):
self.operation.run(self.arguments)
self.timer.start()
for _ in range(self.iterations):
self.operation.run(self.arguments)
self.timer.stop_and_wait()
runtime = self.timer.duration(self.iterations)
return runtime
#
# CUTLASS Profiler
#
def run_cutlass_profiler(self):
alpha = 1.0
beta = 1.0
profiler_path = CUTLASS_PATH + "/build/tools/profiler/cutlass_profiler"
kernel_name = self.operation.procedural_name()
verification_providers = "device"
provider = "cutlass"
problem_size = self.arguments.problem_size
if "cutlass3x" in kernel_name:
# cutlass3x generator only have column-major output
layout_name = self.operation.layout_name_3x()
if layout_name[-1] == "t":
new_layout_name = "".join(["n" for l in layout_name if l == "t" or "t"])
problem_size = GemmCoord(problem_size.n, problem_size.m, problem_size.k)
kernel_name = kernel_name.replace(layout_name, new_layout_name)
batch_count = self.arguments.batch_count
cmd = f"{profiler_path} --kernels={kernel_name} --verification-providers={verification_providers} " \
f"--providers={provider} --m={problem_size.m()} --n={problem_size.n()} --k={problem_size.k()} " \
f"--batch_count={batch_count} --alpha={alpha} --beta={beta} "\
f"--warmup-iterations={self.warmup_iterations} --profiling-iterations={self.iterations}"
result = subprocess.getoutput(cmd)
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
runtime = float(m.group("runtime"))
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
bytes = int(m.group("bytes"))
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
flops = int(m.group("flops"))
# check if the problem size matches
assert bytes == self.bytes(problem_size, batch_count, beta)
assert flops == self.flops(problem_size, batch_count, beta)
return runtime
def bytes(self, problem_size, batch_count=1, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
bytes = (
(DataTypeSize[self.operation.A.element] * m // 8) * k
+ (DataTypeSize[self.operation.B.element] * n // 8) * k
+ (DataTypeSize[self.operation.C.element] * m // 8) * n
)
if beta != 0:
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
bytes *= batch_count
return bytes
def flops(self, problem_size, batch_count=1, beta=0.0):
m = problem_size.m()
n = problem_size.n()
k = problem_size.k()
flops_ = (m * n * k) * 2 * batch_count
if beta != 0:
flops_ += m * n * batch_count * 2
return flops_