197 lines
7.0 KiB
Python
197 lines
7.0 KiB
Python
#################################################################################################
|
|
#
|
|
# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: BSD-3-Clause
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
#
|
|
# 1. Redistributions of source code must retain the above copyright notice, this
|
|
# list of conditions and the following disclaimer.
|
|
#
|
|
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
# this list of conditions and the following disclaimer in the documentation
|
|
# and/or other materials provided with the distribution.
|
|
#
|
|
# 3. Neither the name of the copyright holder nor the names of its
|
|
# contributors may be used to endorse or promote products derived from
|
|
# this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
#################################################################################################
|
|
|
|
"""
|
|
Profiler based on the cuda events
|
|
"""
|
|
|
|
import re
|
|
import subprocess
|
|
|
|
from cutlass_cppgen.utils.lazy_import import lazy_import
|
|
cuda = lazy_import("cuda.cuda")
|
|
cudart = lazy_import("cuda.cudart")
|
|
import numpy as np
|
|
|
|
from cutlass_cppgen import CUTLASS_PATH
|
|
from cutlass_cppgen.backend.library import DataTypeSize
|
|
from cutlass_cppgen.op.op import OperationBase
|
|
from cutlass_cppgen.shape import GemmCoord
|
|
from cutlass_cppgen.utils.datatypes import is_numpy_tensor
|
|
|
|
|
|
class GpuTimer:
|
|
def __init__(self) -> None:
|
|
self.events = [
|
|
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
|
|
cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
|
|
]
|
|
|
|
def start(self, stream=None):
|
|
if not stream:
|
|
stream = cuda.CUstream(0)
|
|
|
|
(err,) = cuda.cuEventRecord(self.events[0], stream)
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError(f"CUDA Error {str(err)}")
|
|
|
|
def stop(self, stream=None):
|
|
if not stream:
|
|
stream = cuda.CUstream(0)
|
|
|
|
(err,) = cuda.cuEventRecord(self.events[1], stream)
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError(f"CUDA Error {str(err)}")
|
|
pass
|
|
|
|
def stop_and_wait(self, stream=None):
|
|
if not stream:
|
|
stream = cuda.CUstream(0)
|
|
|
|
self.stop(stream)
|
|
if stream:
|
|
(err,) = cuda.cuStreamSynchronize(stream)
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError(f"CUDA Error {str(err)}")
|
|
else:
|
|
(err,) = cudart.cudaDeviceSynchronize()
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError(f"CUDA Error {str(err)}")
|
|
|
|
def duration(self, iterations=1):
|
|
err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
|
|
if err != cuda.CUresult.CUDA_SUCCESS:
|
|
raise RuntimeError(f"CUDA Error {str(err)}")
|
|
return duration / float(iterations)
|
|
|
|
|
|
class CUDAEventProfiler:
|
|
def __init__(self, op: OperationBase, warmup_iterations: int=500, iterations: int=500, *args, **kwargs) -> None:
|
|
self.arguments = op.run(*args, **kwargs)
|
|
self.operation = op.operation
|
|
self.warmup_iterations = warmup_iterations
|
|
self.iterations = iterations
|
|
self.timer = GpuTimer()
|
|
|
|
#
|
|
# Cutlass Python Interface Profiler
|
|
#
|
|
|
|
def __call__(self):
|
|
for _ in range(self.warmup_iterations):
|
|
self.operation.run(self.arguments)
|
|
|
|
self.timer.start()
|
|
for _ in range(self.iterations):
|
|
self.operation.run(self.arguments)
|
|
|
|
self.timer.stop_and_wait()
|
|
runtime = self.timer.duration(self.iterations)
|
|
return runtime
|
|
|
|
#
|
|
# CUTLASS Profiler
|
|
#
|
|
|
|
def run_cutlass_profiler(self):
|
|
alpha = 1.0
|
|
beta = 1.0
|
|
|
|
profiler_path = CUTLASS_PATH + "/build/tools/profiler/cutlass_profiler"
|
|
kernel_name = self.operation.procedural_name()
|
|
verification_providers = "device"
|
|
provider = "cutlass"
|
|
problem_size = self.arguments.problem_size
|
|
|
|
if "cutlass3x" in kernel_name:
|
|
# cutlass3x generator only have column-major output
|
|
layout_name = self.operation.layout_name_3x()
|
|
if layout_name[-1] == "t":
|
|
new_layout_name = "".join(["n" for l in layout_name if l == "t" or "t"])
|
|
problem_size = GemmCoord(problem_size.n, problem_size.m, problem_size.k)
|
|
kernel_name = kernel_name.replace(layout_name, new_layout_name)
|
|
|
|
batch_count = self.arguments.batch_count
|
|
|
|
cmd = f"{profiler_path} --kernels={kernel_name} --verification-providers={verification_providers} " \
|
|
f"--providers={provider} --m={problem_size.m()} --n={problem_size.n()} --k={problem_size.k()} " \
|
|
f"--batch_count={batch_count} --alpha={alpha} --beta={beta} "\
|
|
f"--warmup-iterations={self.warmup_iterations} --profiling-iterations={self.iterations}"
|
|
|
|
result = subprocess.getoutput(cmd)
|
|
|
|
m = re.search(r"Runtime:\s+(?P<runtime>\d+.\d+)", result)
|
|
runtime = float(m.group("runtime"))
|
|
|
|
m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
|
|
bytes = int(m.group("bytes"))
|
|
|
|
m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
|
|
flops = int(m.group("flops"))
|
|
|
|
# check if the problem size matches
|
|
assert bytes == self.bytes(problem_size, batch_count, beta)
|
|
assert flops == self.flops(problem_size, batch_count, beta)
|
|
|
|
return runtime
|
|
|
|
def bytes(self, problem_size, batch_count=1, beta=0.0):
|
|
m = problem_size.m()
|
|
n = problem_size.n()
|
|
k = problem_size.k()
|
|
|
|
bytes = (
|
|
(DataTypeSize[self.operation.A.element] * m // 8) * k
|
|
+ (DataTypeSize[self.operation.B.element] * n // 8) * k
|
|
+ (DataTypeSize[self.operation.C.element] * m // 8) * n
|
|
)
|
|
|
|
if beta != 0:
|
|
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
|
|
|
|
bytes *= batch_count
|
|
|
|
return bytes
|
|
|
|
def flops(self, problem_size, batch_count=1, beta=0.0):
|
|
m = problem_size.m()
|
|
n = problem_size.n()
|
|
k = problem_size.k()
|
|
|
|
flops_ = (m * n * k) * 2 * batch_count
|
|
|
|
if beta != 0:
|
|
flops_ += m * n * batch_count * 2
|
|
|
|
return flops_
|
|
|