Updates for 3.1 (#932)

This commit is contained in:
ANIKET SHIVAM
2023-04-29 06:34:27 -07:00
committed by GitHub
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions

View File

@ -200,7 +200,7 @@ class ArtifactManager:
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
def emit_compile_(self, operation_list, compilation_options):
"""
Compile a list of kernels and store them into database
"""
@ -306,41 +306,17 @@ class ArtifactManager:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
else:
options = compilation_options.get()
cmd = (
"echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1"
% source_buffer_host
)
filtered_opts = [
"-default-device",
"-Xcicc",
"-Xllc",
"--expt-relaxed-constexpr",
"-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
]
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and "-arch=sm_" not in opt:
if "--include-path=" in opt:
cmd += " " + opt.replace(
"--include-path=",
"-I",
)
else:
cmd += " " + opt
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
@ -375,7 +351,6 @@ class ArtifactManager:
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
@ -398,17 +373,9 @@ class ArtifactManager:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.")
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
operation_list, compile_options)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:

View File

@ -43,10 +43,10 @@
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
.def(py::init([](int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
return new cutlass::TensorRef<T, L>(ptr, layout_);
}))
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);