Updates for 3.1 (#932)

2023-04-29 06:34:27 -07:00
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions
--- a/python/cutlass/backend/compiler.py
+++ b/python/cutlass/backend/compiler.py
@ -200,7 +200,7 @@ class ArtifactManager:
            self.compiled_cache_host.insert(key, compiled_host_fns)
        return True

-    def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
+    def emit_compile_(self, operation_list, compilation_options):
        """
        Compile a list of kernels and store them into database
        """
@ -306,41 +306,17 @@ class ArtifactManager:
                cubin_image = file.read()

        # Set up the host-side library code
-        if requires_nvcc_hostlib_compilation:
-            cmd_template = (
-                "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
-                % source_buffer_host
-            )
-            cmd = SubstituteTemplate(
-                cmd_template,
-                {
-                    "cuda_install_path": CUDA_INSTALL_PATH,
-                    "options": compilation_options.get_str(),
-                },
-            )
-        else:
-            options = compilation_options.get()
-            cmd = (
-                "echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1"
-                % source_buffer_host
-            )
-            filtered_opts = [
-                "-default-device",
-                "-Xcicc",
-                "-Xllc",
-                "--expt-relaxed-constexpr",
-                "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
-            ]
-            for opt in options:
-                opt = opt.decode("utf-8")
-                if opt not in filtered_opts and "-arch=sm_" not in opt:
-                    if "--include-path=" in opt:
-                        cmd += " " + opt.replace(
-                            "--include-path=",
-                            "-I",
-                        )
-                    else:
-                        cmd += " " + opt
+        cmd_template = (
+            "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
+            % source_buffer_host
+        )
+        cmd = SubstituteTemplate(
+            cmd_template,
+            {
+                "cuda_install_path": CUDA_INSTALL_PATH,
+                "options": compilation_options.get_str(),
+            },
+        )

        tempfile.tempdir = "./"
        temp = tempfile.NamedTemporaryFile(
@ -375,7 +351,6 @@ class ArtifactManager:
        # save the cubin
        operation_key = []
        operation_list = []
-        requires_nvcc_hostlib_compilation = False
        for operation in operations:
            # step 1: get kernel string as key
            key = operation.rt_module.emit() + operation.procedural_name() + self.backend
@ -398,17 +373,9 @@ class ArtifactManager:
                operation_list.append(operation.rt_module)
                operation_key.append(key)

-            # Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
-            # the PyCUTLASS host-side library. Otherwise, g++ will be used.
-            if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x:
-                if self.backend == "nvrtc":
-                    raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.")
-
-                requires_nvcc_hostlib_compilation = True
-
        if len(operation_list) > 0:
            cubin_image, host_lib, host_file = self.emit_compile_(
-                operation_list, compile_options, requires_nvcc_hostlib_compilation)
+                operation_list, compile_options)

            err, module = cuda.cuModuleLoadData(cubin_image)
            if err != cuda.CUresult.CUDA_SUCCESS:
--- a/python/cutlass/cpp/include/tensor_ref_view.h
+++ b/python/cutlass/cpp/include/tensor_ref_view.h
@ -43,10 +43,10 @@
 template<typename T, typename L, typename TF>
 void bind_tensor_ref_view(py::module &m, std::string name) {
    py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
-        .def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
+        .def(py::init([](int64_t address, const L& layout_ ) {
            T* ptr = reinterpret_cast< T*>(address);
-            new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
-        })
+            return new cutlass::TensorRef<T, L>(ptr, layout_);
+        }))
        .def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
            T* ptr = tensor_ref.data();
            return int64_t(ptr);
--- a/python/setup.py
+++ b/python/setup.py
@ -29,9 +29,12 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 #################################################################################################
-
+import copy
 import os
+from pybind11.setup_helpers import Pybind11Extension
+import setuptools
 from setuptools import setup
+from setuptools.command.build_ext import build_ext


 def _cutlass_path_from_dir() -> str:
@ -61,31 +64,57 @@ cutlass_path = (
    else _cutlass_path_from_dir()
 )

+
 # Root of the CUDA installation: the CUDA_INSTALL_PATH environment variable takes precedence;
 # when it is unset, the path is obtained from _cuda_install_path_from_nvcc().
 cuda_install_path = (
     os.getenv('CUDA_INSTALL_PATH')
     if os.getenv('CUDA_INSTALL_PATH') is not None
     else _cuda_install_path_from_nvcc()
 )

-ext_modules = []

-try:
-    from pybind11.setup_helpers import Pybind11Extension, build_ext
-    include_dirs = [
-        cutlass_path + '/include',
-        cuda_install_path + '/include',
-        cutlass_path + '/tools/util/include',
-        cutlass_path + '/test',
-    ]
+class BuildExtension(build_ext):
+    """
+    Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings.
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Forward all positional and keyword arguments unchanged to the base-class constructor.
+        """
+        super().__init__(*args, **kwargs)

-    ext_modules = [
-        Pybind11Extension('cutlass_bindings',
-                          ['cutlass/cpp/cutlass_bindings.cpp'],
-                          include_dirs=include_dirs,
-                          extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1'])
-    ]
-except ImportError:
-    pass
+    def build_extensions(self):
+        original_compile = self.compiler._compile
+
+        def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+            """
+            Wrapper around build_ext.compiler._compile method
+            """
+            postargs = copy.deepcopy(extra_postargs)
+            postargs = [f for f in postargs if f not in ['-g0', '-fvisibility=hidden']]
+            postargs.extend(["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu'])
+            try:
+                original_compiler = self.compiler.compiler_so
+                self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc'])
+                original_compile(obj, src, ext, cc_args, postargs, pp_opts)
+            finally:
+                self.compiler.set_executable('compiler_so', original_compiler)
+
+        self.compiler._compile = custom_compile
+        super().build_extensions()
+
+
+include_dirs = [
+    cutlass_path + '/include',
+    cuda_install_path + '/include',
+    cutlass_path + '/tools/util/include',
+    cutlass_path + '/test',
+]
+
+
+ext_modules = [
+    Pybind11Extension('cutlass_bindings',
+                      ['cutlass/cpp/cutlass_bindings.cpp'],
+                      include_dirs=include_dirs,
+                      extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'],
+                      libraries=['cudart'])
+]


 setup(
@ -103,4 +132,7 @@ setup(
        'treelib'
        ],
    ext_modules=ext_modules,
+    cmdclass={
+                 'build_ext': BuildExtension
+             }
 )