Updates to Python interface for PyPI packaging (#1209)

* Updates * Updates to notebooks
2023-11-28 12:52:12 -06:00
parent b5d8a5d9cc
commit 8098336d51
11 changed files with 279 additions and 59 deletions
--- a/python/README.md
+++ b/python/README.md
@@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
 plan.run(A, B, C, D)
 ```

-**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future.
-We welcome feedback from the community.
-
 ### Overview
 The CUTLASS Python interface aims to provide an ease-of-use interface for using CUTLASS via Python. Toward this goal,
 the CUTLASS Python interface attempts to:
@@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer
 **NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`.

 #### Installation
-The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing
+Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
+```bash
+pip install nvidia-cutlass
+```
+
+The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing
 ```bash
 pip install .
 ```

-If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform:
+If you would like to be able to make changes to the CUTLASS Python interface and have them reflected when using the interface, perform:
 ```bash
 pip install -e .
 ```
--- a/python/cutlass/__init__.py
+++ b/python/cutlass/__init__.py
@@ -85,7 +85,15 @@ this = sys.modules[__name__]
 this.logger = logging.getLogger(__name__)

 # RMM is only supported for Python 3.9+
-this.use_rmm = (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3
+if sys.version_info >= (3, 9):  # compare (major, minor) together: true for 3.9+ and for any later major version
+    try:
+        import rmm  # optional dependency; only its importability is probed here
+        this.use_rmm = True
+    except ImportError:
+        this.use_rmm = False  # RMM is not installed
+else:
+    this.use_rmm = False  # interpreter is older than 3.9
+

 def set_log_level(level: int):
    """
@@ -134,9 +142,8 @@ def get_memory_pool():
    return this.memory_pool


-from cuda import cuda
+from cuda import cuda, cudart

-this._context = None
 this._device_id = None
 def initialize_cuda_context():
    if this._device_id is not None:
@@ -149,10 +156,10 @@ def initialize_cuda_context():
    device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
    if device_id is None:
        if not this.use_rmm:
-            # We must manually call cuInit in the absence of RMM
-            err, = cuda.cuInit(0)
-            if err != cuda.CUresult.CUDA_SUCCESS:
-                raise Exception(f"cuInit failed with error {err}")
+            # Manually call cuInit() and create context by making a runtime API call
+            err, = cudart.cudaFree(0)
+            if err != cudart.cudaError_t.cudaSuccess:
+                raise RuntimeError(f"cudaFree failed with error {err}")

        err, device_count = cuda.cuDeviceGetCount()
        if err != cuda.CUresult.CUDA_SUCCESS:
@@ -163,16 +170,6 @@ def initialize_cuda_context():

    this._device_id = device_id

-    if not this.use_rmm and this._context is None:
-        # We must manually initialize the context in the absence of RMM
-        err, device = cuda.cuDeviceGet(this._device_id)
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise Exception(f"cuDeviceGet failed with error {err}")
-
-        err, this._context = cuda.cuCtxCreate(0, device)
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise Exception(f"cuCtxCreate failed with error {err}")
-

 def device_id() -> int:
    initialize_cuda_context()
--- a/python/cutlass/backend/compiler.py
+++ b/python/cutlass/backend/compiler.py
@@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}"
 def compile_with_nvcc(cmd, source, error_file):
    succeed = True
    try:
-        subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        error_message = e.output.decode()
        with open(error_file, "w") as error_out:
@@ -82,20 +82,19 @@ class CompilationOptions:
        self.arch = arch

    def get_str(self):
-        options = ""
-
+        opts = []
        for flag in self.flags:
-            options += " " + flag
+            opts.append(flag)

        for incl in self.include_paths:
-            options += " --include-path=%s" % incl
+            opts.append(f"--include-path={incl}")

-        arch_flag = " -arch=sm_%d" % self.arch
+        arch_flag = f"-arch=sm_{self.arch}"
        if self.arch == 90:
            arch_flag += "a"
-        options += arch_flag
+        opts.append(arch_flag)

-        return options
+        return " ".join(opts)

    def get(self):
        options = []
@@ -104,9 +103,9 @@ class CompilationOptions:
            options.append(bytes(str.encode(flag)))

        for incl in self.include_paths:
-            options.append(bytes(str.encode("--include-path=%s" % incl)))
+            options.append(bytes(str.encode(f" --include-path={incl}")))

-        arch_flag = " -arch=sm_%d" % self.arch
+        arch_flag = f" -arch=sm_{self.arch}"
        if self.arch == 90:
            arch_flag += "a"

@@ -323,34 +322,35 @@ class ArtifactManager:
                "tarfile": temp_cubin.name,
            }
            cmd = SubstituteTemplate(cmd_template, values)
-            compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt")
+            compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")

            # load the cubin image
            with open(temp_cubin.name, "rb") as file:
                cubin_image = file.read()

-        # Set up the host-side library code
-        cmd_template = (
-            "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
-            % source_buffer_host
-        )
-        cmd = SubstituteTemplate(
-            cmd_template,
-            {
-                "cuda_install_path": cuda_install_path(),
-                "options": host_compilation_options.get_str(),
-            },
-        )
-
        tempfile.tempdir = "./"
-        temp = tempfile.NamedTemporaryFile(
+        temp_src = tempfile.NamedTemporaryFile(
+            prefix="host_src", suffix=".cu", delete=True)
+
+        # Write the host source
+        with open(temp_src.name, "w") as outfile:
+            outfile.write(source_buffer_host)
+
+        temp_dst = tempfile.NamedTemporaryFile(
            prefix="host_func", suffix=".so", delete=True)

-        cmd += " - -shared -o %s -lcudart -lcuda" % temp.name
-        compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
-        host_lib = ctypes.CDLL(temp.name)
+        # Set up host compilation arguments
+        cmd = []
+        cmd.append(f"{cuda_install_path()}/bin/nvcc")
+        cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
+        cmd.extend(host_compilation_options.get_str().split(" "))
+        cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])

-        return cubin_image, host_lib, temp
+        # Compile and load the library
+        compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
+        host_lib = ctypes.CDLL(temp_dst.name)
+
+        return cubin_image, host_lib, temp_dst

    def add_module(self, operations, compile_options=None, bypass_cache=False):
        """
--- a/python/docs_src/source/install.md
+++ b/python/docs_src/source/install.md
@@ -1,5 +1,12 @@
 # Installation

+## Installing a stable release
+
+Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
+```bash
+pip install nvidia-cutlass
+```
+
 ## Installing from source

 Installing from source requires the latest CUDA Toolkit that matches the major.minor of CUDA Python installed.