Updates for 3.1 (#932)

This commit is contained in:
ANIKET SHIVAM
2023-04-29 06:34:27 -07:00
committed by GitHub
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions

View File

@ -200,7 +200,7 @@ class ArtifactManager:
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
def emit_compile_(self, operation_list, compilation_options):
"""
Compile a list of kernels and store them into database
"""
@ -306,41 +306,17 @@ class ArtifactManager:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
else:
options = compilation_options.get()
cmd = (
"echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1"
% source_buffer_host
)
filtered_opts = [
"-default-device",
"-Xcicc",
"-Xllc",
"--expt-relaxed-constexpr",
"-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
]
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and "-arch=sm_" not in opt:
if "--include-path=" in opt:
cmd += " " + opt.replace(
"--include-path=",
"-I",
)
else:
cmd += " " + opt
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
@ -375,7 +351,6 @@ class ArtifactManager:
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
@ -398,17 +373,9 @@ class ArtifactManager:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.")
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
operation_list, compile_options)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:

View File

@ -43,10 +43,10 @@
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
.def(py::init([](int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
return new cutlass::TensorRef<T, L>(ptr, layout_);
}))
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);

View File

@ -29,9 +29,12 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import copy
import os
from pybind11.setup_helpers import Pybind11Extension
import setuptools
from setuptools import setup
from setuptools.command.build_ext import build_ext
def _cutlass_path_from_dir() -> str:
@ -61,31 +64,57 @@ cutlass_path = (
else _cutlass_path_from_dir()
)
# Root of the CUDA toolkit installation. An explicitly set CUDA_INSTALL_PATH
# environment variable wins (even if empty); only when it is unset do we fall
# back to _cuda_install_path_from_nvcc().
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
if cuda_install_path is None:
    cuda_install_path = _cuda_install_path_from_nvcc()
ext_modules = []
try:
from pybind11.setup_helpers import Pybind11Extension, build_ext
include_dirs = [
cutlass_path + '/include',
cuda_install_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/test',
]
class BuildExtension(build_ext):
"""
Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings.
"""
def __init__(self, *args, **kwargs):
# Pass-through constructor: forwards every positional and keyword argument to the parent build_ext unchanged.
super().__init__(*args, **kwargs)
ext_modules = [
Pybind11Extension('cutlass_bindings',
['cutlass/cpp/cutlass_bindings.cpp'],
include_dirs=include_dirs,
extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1'])
]
except ImportError:
pass
def build_extensions(self):
    """
    Build all extensions, routing every object-file compilation through NVCC.

    The compiler's private ``_compile`` hook is replaced with a wrapper that
    rewrites the trailing compile arguments and temporarily swaps the
    ``compiler_so`` executable for ``${cuda_install_path}/bin/nvcc``. The
    original executable is restored after each compile, whether or not it
    succeeded. The ``_compile`` hook itself stays patched on this compiler
    instance.
    """
    original_compile = self.compiler._compile

    # Flags removed from the incoming post-args before NVCC is invoked
    # (presumably because NVCC does not accept them as top-level options).
    dropped_flags = ('-g0', '-fvisibility=hidden')
    # Flags appended for NVCC: host-compiler options are forwarded via
    # -Xcompiler, and '-x cu' makes NVCC treat the .cpp source as CUDA.
    nvcc_flags = ["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu']

    def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        """
        Wrapper around build_ext.compiler._compile method
        """
        # Build a fresh list so the caller's extra_postargs is never mutated
        postargs = [flag for flag in (extra_postargs or []) if flag not in dropped_flags]
        postargs.extend(nvcc_flags)

        # Capture the current executable *before* entering the try block so that
        # the finally clause always has a value to restore and cannot raise
        # NameError in place of the real failure.
        original_compiler = self.compiler.compiler_so
        try:
            self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc'])
            original_compile(obj, src, ext, cc_args, postargs, pp_opts)
        finally:
            self.compiler.set_executable('compiler_so', original_compiler)

    self.compiler._compile = custom_compile
    super().build_extensions()
# Header search paths for the bindings: CUTLASS include, CUDA toolkit include,
# CUTLASS tools/util include, and the CUTLASS test directory.
include_dirs = [
    f'{cutlass_path}/include',
    f'{cuda_install_path}/include',
    f'{cutlass_path}/tools/util/include',
    f'{cutlass_path}/test',
]

# The single pybind11 extension built by this package. It is compiled from
# cutlass/cpp/cutlass_bindings.cpp and linked against the cudart library.
ext_modules = [
    Pybind11Extension(
        'cutlass_bindings',
        ['cutlass/cpp/cutlass_bindings.cpp'],
        include_dirs=include_dirs,
        extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'],
        libraries=['cudart'],
    )
]
setup(
@ -103,4 +132,7 @@ setup(
'treelib'
],
ext_modules=ext_modules,
cmdclass={
'build_ext': BuildExtension
}
)