Rename python/cutlass to python/cutlass_cppgen (#2652)

2025-09-18 13:26:57 -05:00
parent 4260d4aef9
commit 177a82e251
71 changed files with 1 additions and 1 deletions
--- a/python/cutlass_cppgen/backend/compiler.py
+++ b/python/cutlass_cppgen/backend/compiler.py
@ -0,0 +1,462 @@
+#################################################################################################
+#
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#################################################################################################
+
+import ctypes
+import json
+import os
+import sqlite3
+import subprocess
+import tempfile
+
+from cutlass_cppgen.utils.lazy_import import lazy_import
+cuda = lazy_import("cuda.cuda")
+cudart = lazy_import("cuda.cudart")
+nvrtc = lazy_import("cuda.nvrtc")
+from cutlass_library import SubstituteTemplate
+
+import cutlass_cppgen
+from cutlass_cppgen import CACHE_FILE, CUTLASS_PATH, cuda_install_path, logger
+from cutlass_cppgen.backend.gemm_operation import GemmOperationUniversal
+from cutlass_cppgen.backend.library import ApiVersion
+from cutlass_cppgen.backend.utils.device import device_cc
+
+IncludeTemplate = r"""#include "${include}"
+"""
+
+
+def compile_with_nvcc(cmd, source, error_file):
+    succeed = True
+    try:
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        error_message = e.output.decode()
+        with open(error_file, "w") as error_out:
+            error_log = "Compilation error for the following kernel: \n"
+            error_log += source
+            error_log += "\nError Message:\n"
+            error_log += error_message
+            error_out.write(error_log)
+        succeed = False
+    if not succeed:
+        # Print the error log to stdout if log level is set to warning or higher
+        # verbosity. Otherwise, simply point to the error log file.
+        logger.warning(error_log)
+        raise Exception(f"Invalid Kernel. See '{error_file}' for details.")
+
+
+class CompilationOptions:
+    """
+    Compilation options.
+    """
+
+    def __init__(self, flags, arch, include_paths=[]):
+        self.includes = []
+        self.include_paths = include_paths
+        self.flags = flags
+        self.arch = arch
+
+    def get_str(self):
+        opts = []
+        for flag in self.flags:
+            opts.append(flag)
+
+        for incl in self.include_paths:
+            opts.append(f"--include-path={incl}")
+
+        arch_flag = f"-arch=sm_{self.arch}"
+        if self.arch in [90, 100, 101, 103, 120, 121] and int(cutlass_cppgen.nvcc_version().split('.')[0]) >= 12:
+            arch_flag += "a"
+        opts.append(arch_flag)
+
+        return " ".join(opts)
+
+    def get(self):
+        options = []
+
+        for flag in self.flags:
+            options.append(bytes(str.encode(flag)))
+
+        for incl in self.include_paths:
+            options.append(bytes(str.encode(f" --include-path={incl}")))
+
+        arch_flag = f" -arch=sm_{self.arch}"
+        if self.arch in [90, 100, 101, 103, 120, 121]:
+            arch_flag += "a"
+
+        options.append(bytes(str.encode(arch_flag)))
+
+        return options
+
+
+def convertToBinaryData(filename):
+    with open(filename, "rb") as file:
+        blobData = file.read()
+    return blobData
+
+
+def CDLLBin(host_binary):
+    tempfile.tempdir = "./"
+    temp_so = tempfile.NamedTemporaryFile(prefix="host_func", suffix=".so", delete=True)
+    with open(temp_so.name, "wb") as file:
+        file.write(host_binary)
+    host_lib = ctypes.CDLL(temp_so.name)
+    return host_lib
+
+
+class ArtifactManager:
+    """
+    Artifact manager
+    """
+
+    def __init__(self) -> None:
+        connection = sqlite3.connect(CACHE_FILE)
+        cursor = connection.cursor()
+        # Create the table if it does not already exist
+        sqlite_create_table_query = """
+        CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE,
+                                                        cubin BLOB NOT NULL,
+                                                        hostbin BLOB NOT NULL,
+                                                        op_name TEXT NOT NULL,
+                                                        op_attrs TEXT NOT NULL)
+        """
+        cursor.execute(sqlite_create_table_query)
+        connection.commit()
+        cursor.close()
+
+        self._nvrtc_compile_options = ["-std=c++17", "-default-device"]
+        self._nvcc_compile_options = [
+            "-std=c++17",
+            "--expt-relaxed-constexpr",
+            "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
+        ]
+        self.nvcc()
+        self.compiled_cache_device = {}
+        self.compiled_cache_host = {}
+
+    def nvrtc(self):
+        self.backend = "nvrtc"
+        self.default_compile_options = self._nvrtc_compile_options
+
+    def nvcc(self):
+        self.backend = "nvcc"
+        self.default_compile_options = self._nvcc_compile_options
+
+    def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
+        connection = sqlite3.connect(CACHE_FILE)
+        cursor = connection.cursor()
+        sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
+
+        hostbin = convertToBinaryData(hostfile)
+
+        data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
+
+        cursor.execute(sqlite_insert_blob_query, data_tuple)
+        connection.commit()
+        cursor.close()
+
+    def load_operation(self, op_key, extra_funcs):
+        connection = sqlite3.connect(CACHE_FILE)
+        cursor = connection.cursor()
+        sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
+        cursor.execute(sqlite_fetch_blob_query, (op_key,))
+        record = cursor.fetchall()
+        if len(record) == 0:
+            return False
+        for row in record:
+            key, cubin_image, host_binary, operation_name, op_attr = row
+            op_attr = json.loads(op_attr)
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError("Cuda Error: {}".format(err))
+
+            err, kernel = cuda.cuModuleGetFunction(module, bytes(str.encode(operation_name)))
+            self.compiled_cache_device[key] = kernel
+
+            compiled_host_fns = {}
+            host_lib = CDLLBin(host_binary)
+
+            func_name = operation_name + "_get_params"
+            func = getattr(host_lib, func_name)
+            func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
+            compiled_host_fns["get_args"] = func
+
+            func_name = operation_name + "_shared_memory_size"
+            func = getattr(host_lib, func_name)
+            compiled_host_fns["shared_memory_capacity"] = func()
+
+            for attr in op_attr:
+                if isinstance(attr, str):
+                    func_name = operation_name + "_" + attr
+                    func = getattr(host_lib, func_name)
+
+                    # Set the return type of the function
+                    if attr in extra_funcs and extra_funcs[attr] != None:
+                        func.restype = extra_funcs[attr]
+
+                    compiled_host_fns[attr] = func
+
+            self.compiled_cache_host[key] = compiled_host_fns
+        return True
+
+    def emit_compile_(self, operation_list, compilation_options, host_compilation_options):
+        """
+        Compile a list of kernels and store them into database
+        """
+        source_buffer_device = ""
+        source_buffer_host = ""
+        # 1. include
+        includes = []
+        for operation in operation_list:
+            for incl in operation.emitter.includes:
+                if incl not in includes:
+                    includes.append(incl)
+
+        includes_host = ["builtin_types.h", "device_launch_parameters.h", "cstddef"] + includes
+        for incl in includes:
+            source_buffer_device += SubstituteTemplate(
+                IncludeTemplate,
+                {"include": incl},
+            )
+
+        for incl in includes_host:
+            source_buffer_host += SubstituteTemplate(
+                IncludeTemplate,
+                {"include": incl},
+            )
+
+        # 2. Operations
+        for operation in operation_list:
+            source_buffer_device += operation.emit()
+            source_buffer_host += operation.emit()
+            values = {
+                "operation_name": operation.name(),
+                "operation_suffix": operation.emitter.operation_suffix,
+            }
+            source_buffer_device += SubstituteTemplate(
+                operation.KernelTemplate,
+                values,
+            )
+            source_buffer_host += SubstituteTemplate(operation.HostTemplate, values)
+
+        if self.backend == "nvrtc":
+            # 3. compile
+            err, program = nvrtc.nvrtcCreateProgram(
+                str.encode(source_buffer_device),
+                bytes(str.encode("module.cu")),
+                0, [], [])
+
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError("NVRTC Error: {}".format(err))
+
+            # Compile program
+            options = compilation_options.get()
+
+            err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                error_string = "NVRTC Error: {}\n".format(err)
+
+                # Get log from compilation
+                err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
+                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                    raise RuntimeError("NVRTC Error: {}".format(err))
+
+                log = b" " * logSize
+                err, = nvrtc.nvrtcGetProgramLog(program, log)
+                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                    raise RuntimeError("NVRTC Error: {}".format(err))
+
+                raise RuntimeError(error_string + log.decode() + source_buffer_device)
+
+            # Get data from compilation
+            err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError("NVRTC Error: {}".format(err))
+
+            cubin_image = b" " * dataSize
+            (err,) = nvrtc.nvrtcGetCUBIN(program, cubin_image)
+            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+                raise RuntimeError("NVRTC Error: {}".format(err))
+
+        else:  # with nvcc backend
+            # emit code
+            tempfile.tempdir = "./"
+            temp_cu = tempfile.NamedTemporaryFile(
+                prefix="kernel", suffix=".cu", delete=True)
+            temp_cubin = tempfile.NamedTemporaryFile(
+                prefix="kernel", suffix=".cubin", delete=True)
+            with open(temp_cu.name, "w") as file:
+                file.write(source_buffer_device)
+
+            # compile with nvcc
+            cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
+            values = {
+                "cuda_install_path": cuda_install_path(),
+                "options": compilation_options.get_str(),
+                "srcfile": temp_cu.name,
+                "tarfile": temp_cubin.name,
+            }
+            cmd = SubstituteTemplate(cmd_template, values)
+            compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")
+
+            # load the cubin image
+            with open(temp_cubin.name, "rb") as file:
+                cubin_image = file.read()
+
+        tempfile.tempdir = "./"
+        temp_src = tempfile.NamedTemporaryFile(
+            prefix="host_src", suffix=".cu", delete=True)
+
+        # Write the host source
+        with open(temp_src.name, "w") as outfile:
+            outfile.write(source_buffer_host)
+
+        temp_dst = tempfile.NamedTemporaryFile(
+            prefix="host_func", suffix=".so", delete=True)
+
+        # Set up host compilation arguments
+        cmd = []
+        cmd.append(f"{cuda_install_path()}/bin/nvcc")
+        cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
+        cmd.extend(host_compilation_options.get_str().split(" "))
+        cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])
+
+        # Comile and load the library
+        compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
+        host_lib = ctypes.CDLL(temp_dst.name)
+
+        return cubin_image, host_lib, temp_dst
+
+    def add_module(self, operations, compile_options=None, bypass_cache=False):
+        """
+        Insert a new compiled device module
+        """
+        include_paths = [
+            cuda_install_path() + "/include",
+            CUTLASS_PATH + "/include",
+            CUTLASS_PATH + "/tools/util/include",
+            CUTLASS_PATH + "/python/cutlass/cpp/include",
+        ]
+
+        cutlass_cppgen.initialize_cuda_context()
+        arch = device_cc()
+
+        host_compile_options = CompilationOptions(
+            self._nvcc_compile_options, arch, include_paths)
+        if compile_options is None:
+            compile_options = CompilationOptions(
+                self.default_compile_options, arch, include_paths)
+        # save the cubin
+        operation_key = []
+        operation_list = []
+        for operation in operations:
+            # step 1: get kernel string as key
+            key = operation.rt_module.emit() + operation.procedural_name() + self.backend
+            # step 1: check if the operation is in cache
+            compiled_kernel = self.compiled_cache_device.get(key)
+
+            if compiled_kernel is None and not bypass_cache:
+                hit = self.load_operation(key, getattr( operation.rt_module, "extra_funcs", {}))
+                if hit:
+                    compiled_kernel = self.compiled_cache_device.get(key)
+                    assert compiled_kernel is not None
+            if compiled_kernel is not None:
+                operation.rt_module.kernel = compiled_kernel
+                compiled_host_fns = self.compiled_cache_host.get(key)
+                assert compiled_host_fns is not None
+                for key in compiled_host_fns.keys():
+                    setattr(operation.rt_module, key, compiled_host_fns[key])
+                operation.rt_module.initialize()
+            else:
+                operation_list.append(operation.rt_module)
+                operation_key.append(key)
+
+        if len(operation_list) > 0:
+            cubin_image, host_lib, host_file = self.emit_compile_(
+                operation_list, compile_options, host_compile_options)
+
+            err, module = cuda.cuModuleLoadData(cubin_image)
+            if err != cuda.CUresult.CUDA_SUCCESS:
+                raise RuntimeError("Cuda Error: {}".format(err))
+
+            operation_name = []
+            operation_attr = []
+            for operation, key in zip(operation_list, operation_key):
+                # get device kernels
+                err, operation.kernel = cuda.cuModuleGetFunction(
+                    module,
+                    bytes(str.encode(operation.name()))
+                )
+                operation_name.append(operation.name())
+                self.compiled_cache_device[key] = operation.kernel
+                # get host functions
+                compiled_host_fns = {}
+                op_attr = []
+
+                # get param size
+                func_name = operation.name() + "_get_param_size"
+                func = getattr(host_lib, func_name)
+                param_size = func()
+
+                func_name = operation.name() + "_get_params"
+                func = getattr(host_lib, func_name)
+                func.argtype = operation.argtype
+                func.restype = ctypes.POINTER(ctypes.c_char * param_size)
+                setattr(operation, "get_args", func)
+                compiled_host_fns["get_args"] = func
+
+                # set shared memory size
+                func_name = operation.name() + "_shared_memory_size"
+                func = getattr(host_lib, func_name)
+                setattr(operation, "shared_memory_capacity", func())
+                compiled_host_fns["shared_memory_capacity"] = func()
+                # set the maximum dynamic shared size
+                operation.initialize()
+
+                # get extra functions
+                op_attr.append(param_size)
+
+                if hasattr(operation, "extra_funcs"):
+                    for suffix, ret_type  in operation.extra_funcs.items():
+                        func_name = operation.name() + "_" + suffix
+                        func = getattr(host_lib, func_name)
+                        if ret_type is not None:
+                            func.restype = ret_type
+                        setattr(operation, suffix, func)
+                        compiled_host_fns[suffix] = func
+                        op_attr.append(suffix)
+
+                operation_attr.append(op_attr)
+                self.compiled_cache_host[key] = compiled_host_fns
+
+            for (key, operation_name, operation_attr,) in zip(operation_key, operation_name, operation_attr):
+                self.insert_operation(
+                    key, cubin_image, host_file.name, operation_name, operation_attr)