#################################################################################################
#
# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

import ctypes
import json
import os
import sqlite3
import subprocess
import tempfile

from cutlass_cppgen.utils.lazy_import import lazy_import
cuda = lazy_import("cuda.cuda")
cudart = lazy_import("cuda.cudart")
nvrtc = lazy_import("cuda.nvrtc")
from cutlass_library import SubstituteTemplate

import cutlass_cppgen
from cutlass_cppgen import CACHE_FILE, CUTLASS_PATH, cuda_install_path, logger
from cutlass_cppgen.backend.gemm_operation import GemmOperationUniversal
from cutlass_cppgen.backend.library import ApiVersion
from cutlass_cppgen.backend.utils.device import device_cc

IncludeTemplate = r"""#include "${include}"
"""


def compile_with_nvcc(cmd, source, error_file):
    """
    Run the nvcc command in ``cmd``. On failure, write ``source`` and the compiler
    output to ``error_file``, log a warning, and raise an exception.
    """
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        error_message = e.output.decode()
        error_log = "Compilation error for the following kernel: \n"
        error_log += source
        error_log += "\nError Message:\n"
        error_log += error_message
        with open(error_file, "w") as error_out:
            error_out.write(error_log)

        # Print the error log to stdout if log level is set to warning or higher
        # verbosity. Otherwise, simply point to the error log file.
        logger.warning(error_log)
        raise Exception(f"Invalid Kernel. See '{error_file}' for details.")
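
# Illustrative sketch (not part of the original module): compile_with_nvcc expects a
# fully tokenized nvcc command line, the emitted CUDA source (used only for error
# reporting), and a path for the error log. The paths and flags below are placeholders,
# mirroring the command template built in ArtifactManager.emit_compile_:
#
#   cmd = [f"{cuda_install_path()}/bin/nvcc", "-std=c++17", "-arch=sm_90a",
#          "-cubin", "kernel.cu", "-o", "kernel.cubin"]
#   compile_with_nvcc(cmd, source=emitted_source, error_file="./nvcc_error.txt")

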
class CompilationOptions:
    """
    Compilation options passed to the NVRTC or nvcc backend: raw compiler flags,
    the target compute capability, and a list of include paths.
    """

    def __init__(self, flags, arch, include_paths=[]):
        self.includes = []
        self.include_paths = include_paths
        self.flags = flags
        self.arch = arch

    def get_str(self):
        """Return the options as a single string suitable for an nvcc command line."""
        opts = []
        for flag in self.flags:
            opts.append(flag)

        for incl in self.include_paths:
            opts.append(f"--include-path={incl}")

        arch_flag = f"-arch=sm_{self.arch}"
        if self.arch in [90, 100, 101, 103, 120, 121] and int(cutlass_cppgen.nvcc_version().split('.')[0]) >= 12:
            arch_flag += "a"
        opts.append(arch_flag)

        return " ".join(opts)

    def get(self):
        """Return the options as a list of encoded strings for the NVRTC API."""
        options = []

        for flag in self.flags:
            options.append(bytes(str.encode(flag)))

        for incl in self.include_paths:
            options.append(bytes(str.encode(f" --include-path={incl}")))

        arch_flag = f" -arch=sm_{self.arch}"
        if self.arch in [90, 100, 101, 103, 120, 121]:
            arch_flag += "a"

        options.append(bytes(str.encode(arch_flag)))

        return options
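
# Illustrative sketch (assumptions: an sm_90 device, CUDA headers under
# /usr/local/cuda/include, and nvcc 12 or newer). Building options for the nvcc
# backend might look like:
#
#   opts = CompilationOptions(["-std=c++17"], 90, include_paths=["/usr/local/cuda/include"])
#   opts.get_str()  # "-std=c++17 --include-path=/usr/local/cuda/include -arch=sm_90a"
#   opts.get()      # the same flags as a list of encoded strings, for NVRTC
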


def convertToBinaryData(filename):
    """Read ``filename`` and return its contents as a binary blob."""
    with open(filename, "rb") as file:
        blobData = file.read()
    return blobData


def CDLLBin(host_binary):
    """Write a host binary blob to a temporary ``.so`` file and load it with ctypes."""
    tempfile.tempdir = "./"
    temp_so = tempfile.NamedTemporaryFile(prefix="host_func", suffix=".so", delete=True)
    with open(temp_so.name, "wb") as file:
        file.write(host_binary)
    host_lib = ctypes.CDLL(temp_so.name)
    return host_lib
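
# Illustrative sketch (the symbol name is a placeholder): host binaries stored in the
# cache are reloaded through CDLLBin, after which exported helpers can be looked up
# by name, as load_operation does below:
#
#   host_lib = CDLLBin(host_binary_blob)
#   get_size = getattr(host_lib, "some_operation_get_param_size")  # placeholder name
#   param_size = get_size()
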


class ArtifactManager:
    """
    Artifact manager: compiles generated kernels and caches the resulting cubins and
    host libraries, both in memory and in a SQLite database (``CACHE_FILE``).
    """

    def __init__(self) -> None:
        connection = sqlite3.connect(CACHE_FILE)
        cursor = connection.cursor()
        # Create the table if it does not already exist
        sqlite_create_table_query = """
        CREATE TABLE IF NOT EXISTS compiled_operations(op_key TEXT NOT NULL UNIQUE,
                                                       cubin BLOB NOT NULL,
                                                       hostbin BLOB NOT NULL,
                                                       op_name TEXT NOT NULL,
                                                       op_attrs TEXT NOT NULL)
        """
        cursor.execute(sqlite_create_table_query)
        connection.commit()
        cursor.close()

        self._nvrtc_compile_options = ["-std=c++17", "-default-device"]
        self._nvcc_compile_options = [
            "-std=c++17",
            "--expt-relaxed-constexpr",
            "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
        ]
        self.nvcc()
        self.compiled_cache_device = {}
        self.compiled_cache_host = {}
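
    # Illustrative sketch (assumes a previously populated cache file): the on-disk
    # cache can be inspected directly with sqlite3 using the schema created above.
    #
    #   conn = sqlite3.connect(CACHE_FILE)
    #   rows = conn.execute("SELECT op_key, op_name FROM compiled_operations").fetchall()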

    def nvrtc(self):
        """Select NVRTC as the device-code compilation backend."""
        self.backend = "nvrtc"
        self.default_compile_options = self._nvrtc_compile_options

    def nvcc(self):
        """Select nvcc as the device-code compilation backend (the default)."""
        self.backend = "nvcc"
        self.default_compile_options = self._nvcc_compile_options
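
    # Illustrative sketch: a caller can switch the device-code backend on an existing
    # manager before adding modules, e.g.:
    #
    #   manager = ArtifactManager()
    #   manager.nvrtc()   # compile subsequent modules with NVRTC instead of nvcc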

    def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
        """Insert a compiled operation into the SQLite cache."""
        connection = sqlite3.connect(CACHE_FILE)
        cursor = connection.cursor()
        sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""

        hostbin = convertToBinaryData(hostfile)

        data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))

        cursor.execute(sqlite_insert_blob_query, data_tuple)
        connection.commit()
        cursor.close()

    def load_operation(self, op_key, extra_funcs):
        """
        Look up ``op_key`` in the SQLite cache. If found, load the cubin and host
        library into the in-memory caches and return True; otherwise return False.
        """
        connection = sqlite3.connect(CACHE_FILE)
        cursor = connection.cursor()
        sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
        cursor.execute(sqlite_fetch_blob_query, (op_key,))
        record = cursor.fetchall()
        if len(record) == 0:
            return False
        for row in record:
            key, cubin_image, host_binary, operation_name, op_attr = row
            op_attr = json.loads(op_attr)
            err, module = cuda.cuModuleLoadData(cubin_image)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("Cuda Error: {}".format(err))

            err, kernel = cuda.cuModuleGetFunction(module, bytes(str.encode(operation_name)))
            self.compiled_cache_device[key] = kernel

            compiled_host_fns = {}
            host_lib = CDLLBin(host_binary)

            func_name = operation_name + "_get_params"
            func = getattr(host_lib, func_name)
            func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
            compiled_host_fns["get_args"] = func

            func_name = operation_name + "_shared_memory_size"
            func = getattr(host_lib, func_name)
            compiled_host_fns["shared_memory_capacity"] = func()

            for attr in op_attr:
                if isinstance(attr, str):
                    func_name = operation_name + "_" + attr
                    func = getattr(host_lib, func_name)

                    # Set the return type of the function
                    if attr in extra_funcs and extra_funcs[attr] is not None:
                        func.restype = extra_funcs[attr]

                    compiled_host_fns[attr] = func

            self.compiled_cache_host[key] = compiled_host_fns
        return True
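
    # Illustrative sketch (the attribute name is a placeholder): ``extra_funcs`` maps
    # the suffix of an exported host helper to its ctypes return type (or None to keep
    # the default), mirroring the ``extra_funcs`` attribute consumed in add_module:
    #
    #   hit = manager.load_operation(key, extra_funcs={"leading_dim": ctypes.c_int64})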

    def emit_compile_(self, operation_list, compilation_options, host_compilation_options):
        """
        Compile a list of kernels and return the cubin image, the loaded host library,
        and the temporary file backing the host library.
        """
        source_buffer_device = ""
        source_buffer_host = ""
        # 1. include
        includes = []
        for operation in operation_list:
            for incl in operation.emitter.includes:
                if incl not in includes:
                    includes.append(incl)

        includes_host = ["builtin_types.h", "device_launch_parameters.h", "cstddef"] + includes
        for incl in includes:
            source_buffer_device += SubstituteTemplate(
                IncludeTemplate,
                {"include": incl},
            )

        for incl in includes_host:
            source_buffer_host += SubstituteTemplate(
                IncludeTemplate,
                {"include": incl},
            )

        # 2. Operations
        for operation in operation_list:
            source_buffer_device += operation.emit()
            source_buffer_host += operation.emit()
            values = {
                "operation_name": operation.name(),
                "operation_suffix": operation.emitter.operation_suffix,
            }
            source_buffer_device += SubstituteTemplate(
                operation.KernelTemplate,
                values,
            )
            source_buffer_host += SubstituteTemplate(operation.HostTemplate, values)

        if self.backend == "nvrtc":
            # 3. Compile the device code with NVRTC
            err, program = nvrtc.nvrtcCreateProgram(
                str.encode(source_buffer_device),
                bytes(str.encode("module.cu")),
                0, [], [])

            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError("NVRTC Error: {}".format(err))

            # Compile program
            options = compilation_options.get()

            err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                error_string = "NVRTC Error: {}\n".format(err)

                # Get log from compilation
                err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                    raise RuntimeError("NVRTC Error: {}".format(err))

                log = b" " * logSize
                err, = nvrtc.nvrtcGetProgramLog(program, log)
                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                    raise RuntimeError("NVRTC Error: {}".format(err))

                raise RuntimeError(error_string + log.decode() + source_buffer_device)

            # Get data from compilation
            err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError("NVRTC Error: {}".format(err))

            cubin_image = b" " * dataSize
            (err,) = nvrtc.nvrtcGetCUBIN(program, cubin_image)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError("NVRTC Error: {}".format(err))

        else:  # with the nvcc backend
            # Emit the device code to a temporary file
            tempfile.tempdir = "./"
            temp_cu = tempfile.NamedTemporaryFile(
                prefix="kernel", suffix=".cu", delete=True)
            temp_cubin = tempfile.NamedTemporaryFile(
                prefix="kernel", suffix=".cubin", delete=True)
            with open(temp_cu.name, "w") as file:
                file.write(source_buffer_device)

            # Compile with nvcc
            cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
            values = {
                "cuda_install_path": cuda_install_path(),
                "options": compilation_options.get_str(),
                "srcfile": temp_cu.name,
                "tarfile": temp_cubin.name,
            }
            cmd = SubstituteTemplate(cmd_template, values)
            compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")

            # Load the cubin image
            with open(temp_cubin.name, "rb") as file:
                cubin_image = file.read()

        tempfile.tempdir = "./"
        temp_src = tempfile.NamedTemporaryFile(
            prefix="host_src", suffix=".cu", delete=True)

        # Write the host source
        with open(temp_src.name, "w") as outfile:
            outfile.write(source_buffer_host)

        temp_dst = tempfile.NamedTemporaryFile(
            prefix="host_func", suffix=".so", delete=True)

        # Set up host compilation arguments
        cmd = []
        cmd.append(f"{cuda_install_path()}/bin/nvcc")
        cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
        cmd.extend(host_compilation_options.get_str().split(" "))
        cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])

        # Compile and load the library
        compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
        host_lib = ctypes.CDLL(temp_dst.name)

        return cubin_image, host_lib, temp_dst

    def add_module(self, operations, compile_options=None, bypass_cache=False):
        """
        Compile and insert a new device module for the given operations, reusing
        cached kernels where possible.
        """
        include_paths = [
            cuda_install_path() + "/include",
            CUTLASS_PATH + "/include",
            CUTLASS_PATH + "/tools/util/include",
            CUTLASS_PATH + "/python/cutlass/cpp/include",
        ]

        cutlass_cppgen.initialize_cuda_context()
        arch = device_cc()

        host_compile_options = CompilationOptions(
            self._nvcc_compile_options, arch, include_paths)
        if compile_options is None:
            compile_options = CompilationOptions(
                self.default_compile_options, arch, include_paths)
        # save the cubin
        operation_key = []
        operation_list = []
        for operation in operations:
            # step 1: get kernel string as key
            key = operation.rt_module.emit() + operation.procedural_name() + self.backend
            # step 2: check if the operation is in cache
            compiled_kernel = self.compiled_cache_device.get(key)

            if compiled_kernel is None and not bypass_cache:
                hit = self.load_operation(key, getattr(operation.rt_module, "extra_funcs", {}))
                if hit:
                    compiled_kernel = self.compiled_cache_device.get(key)
                    assert compiled_kernel is not None
            if compiled_kernel is not None:
                operation.rt_module.kernel = compiled_kernel
                compiled_host_fns = self.compiled_cache_host.get(key)
                assert compiled_host_fns is not None
                for key in compiled_host_fns.keys():
                    setattr(operation.rt_module, key, compiled_host_fns[key])
                operation.rt_module.initialize()
            else:
                operation_list.append(operation.rt_module)
                operation_key.append(key)

        if len(operation_list) > 0:
            cubin_image, host_lib, host_file = self.emit_compile_(
                operation_list, compile_options, host_compile_options)

            err, module = cuda.cuModuleLoadData(cubin_image)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("Cuda Error: {}".format(err))

            operation_name = []
            operation_attr = []
            for operation, key in zip(operation_list, operation_key):
                # get device kernels
                err, operation.kernel = cuda.cuModuleGetFunction(
                    module,
                    bytes(str.encode(operation.name()))
                )
                operation_name.append(operation.name())
                self.compiled_cache_device[key] = operation.kernel
                # get host functions
                compiled_host_fns = {}
                op_attr = []

                # get param size
                func_name = operation.name() + "_get_param_size"
                func = getattr(host_lib, func_name)
                param_size = func()

                func_name = operation.name() + "_get_params"
                func = getattr(host_lib, func_name)
                func.argtype = operation.argtype
                func.restype = ctypes.POINTER(ctypes.c_char * param_size)
                setattr(operation, "get_args", func)
                compiled_host_fns["get_args"] = func

                # set shared memory size
                func_name = operation.name() + "_shared_memory_size"
                func = getattr(host_lib, func_name)
                setattr(operation, "shared_memory_capacity", func())
                compiled_host_fns["shared_memory_capacity"] = func()
                # set the maximum dynamic shared size
                operation.initialize()

                # get extra functions
                op_attr.append(param_size)

                if hasattr(operation, "extra_funcs"):
                    for suffix, ret_type in operation.extra_funcs.items():
                        func_name = operation.name() + "_" + suffix
                        func = getattr(host_lib, func_name)
                        if ret_type is not None:
                            func.restype = ret_type
                        setattr(operation, suffix, func)
                        compiled_host_fns[suffix] = func
                        op_attr.append(suffix)

                operation_attr.append(op_attr)
                self.compiled_cache_host[key] = compiled_host_fns

            for key, op_name, op_attrs in zip(operation_key, operation_name, operation_attr):
                self.insert_operation(
                    key, cubin_image, host_file.name, op_name, op_attrs)
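

# Illustrative usage sketch (not part of the original module; the `operation` object
# below is assumed to expose rt_module, procedural_name(), and the other attributes
# consumed by add_module). A manager compiles the runtime modules behind a set of
# operations and caches the results on disk and in memory:
#
#   manager = ArtifactManager()
#   manager.add_module([operation])
#   kernel = operation.rt_module.kernel   # ready to launch via the CUDA driver API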