From 8098336d512ef089a2f0e0fa172d5ff5cb18eca5 Mon Sep 17 00:00:00 2001
From: Jack Kosaian <jkosaian@nvidia.com>
Date: Tue, 28 Nov 2023 12:52:12 -0600
Subject: [PATCH] Updates to Python interface for PyPI packaging (#1209)

* Updates

* Updates to notebooks
---
 examples/python/00_basic_gemm.ipynb           | 44 +++++++++++++-
 examples/python/01_epilogue.ipynb             | 45 +++++++++++++-
 .../02_pytorch_extension_grouped_gemm.ipynb   | 46 +++++++++++++++
 examples/python/03_basic_conv2d.ipynb         | 42 ++++++++++++++
 examples/python/04_epilogue_visitor.ipynb     | 45 +++++++++++++-
 pyproject.toml                                |  3 +-
 python/README.md                              | 12 ++--
 python/cutlass/__init__.py                    | 31 +++++-----
 python/cutlass/backend/compiler.py            | 58 +++++++++----------
 python/docs_src/source/install.md             |  7 +++
 setup.cfg                                     |  5 +-
 11 files changed, 279 insertions(+), 59 deletions(-)

diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/00_basic_gemm.ipynb
index e7a130b6..afd6dab4 100644
--- a/examples/python/00_basic_gemm.ipynb
+++ b/examples/python/00_basic_gemm.ipynb
@@ -7,7 +7,46 @@
    "metadata": {},
    "source": [
     "# Basic example of using the CUTLASS Python interface\n",
-    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n"
+    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/master/examples/python/00_basic_gemm.ipynb)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df94d7e6",
+   "metadata": {},
+   "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71c7a069",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cf16785d",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c819bb68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#pip install nvidia-cutlass"
    ]
   },
   {
@@ -16,7 +55,8 @@
    "id": "962324fd",
    "metadata": {},
    "source": [
-    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example.\n"
+    "## General setup\n",
+    "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
    ]
   },
   {
diff --git a/examples/python/01_epilogue.ipynb b/examples/python/01_epilogue.ipynb
index 13acbffd..a58446e4 100644
--- a/examples/python/01_epilogue.ipynb
+++ b/examples/python/01_epilogue.ipynb
@@ -7,14 +7,55 @@
    "metadata": {},
    "source": [
     "# Example of using elementwise activation functions in the CUTLASS Python interface\n",
-    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n"
+    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/master/examples/python/01_epilogue.ipynb)\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3ca993fe",
+   "id": "28c916da",
    "metadata": {},
    "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fcea8ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ec60b57",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1db9e51c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#pip install nvidia-cutlass"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "962324fd",
+   "metadata": {},
+   "source": [
+    "## General setup\n",
     "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
    ]
   },
diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/02_pytorch_extension_grouped_gemm.ipynb
index ecd78280..9196af13 100644
--- a/examples/python/02_pytorch_extension_grouped_gemm.ipynb
+++ b/examples/python/02_pytorch_extension_grouped_gemm.ipynb
@@ -10,6 +10,52 @@
     "This notebook walks through a basic example of using the CUTLASS Python interface to declare\n",
     "a grouped GEMM kernel and export it as a PyTorch CUDA extension. Note that GEMM and Conv2d can also be exported as PyTorch CUDA extensions. \n",
     "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/master/examples/python/02_pytorch_extension_grouped_gemm.ipynb)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d70560e",
+   "metadata": {},
+   "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc7c7458",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2107bb0d",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the CUTLASS Python interface and PyTorch. To do so, uncomment the following line and run the cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9852cb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#pip install nvidia-cutlass torch --extra-index-url https://download.pytorch.org/whl/cu121"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "962324fd",
+   "metadata": {},
+   "source": [
     "## Background on grouped GEMM\n",
     "Grouped GEMM enables one to execute a set of GEMMs (each with potentially different sizes and strides)\n",
     "in a single CUDA kernel. It can be thought of as a generalized version of a pointer-array GEMM,\n",
diff --git a/examples/python/03_basic_conv2d.ipynb b/examples/python/03_basic_conv2d.ipynb
index 962add39..c428319a 100644
--- a/examples/python/03_basic_conv2d.ipynb
+++ b/examples/python/03_basic_conv2d.ipynb
@@ -8,6 +8,48 @@
     "\n",
     "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run Conv2d. \n",
     "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/master/examples/python/03_basic_conv2d.ipynb)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#pip install nvidia-cutlass"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## General setup\n",
     "We first import various packages needed for the example and construct the input and output tensors that will be used in our example."
    ]
   },
diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/04_epilogue_visitor.ipynb
index 69a32226..5a147bcb 100644
--- a/examples/python/04_epilogue_visitor.ipynb
+++ b/examples/python/04_epilogue_visitor.ipynb
@@ -7,14 +7,55 @@
    "metadata": {},
    "source": [
     "# Example of using epilogue visitor in the CUTLASS Python interface\n",
-    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor."
+    "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor.\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/blob/master/examples/python/04_epilogue_visitor.ipynb)\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3ca993fe",
+   "id": "3a800e79",
    "metadata": {},
    "source": [
+    "## Prerequisites for running on Colab\n",
+    "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cfff2c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#nvidia-smi"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06706f00",
+   "metadata": {},
+   "source": [
+    "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "491a7314",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!#pip install nvidia-cutlass"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "962324fd",
+   "metadata": {},
+   "source": [
+    "## General setup\n",
     "We first import various packages needed for the example, construct the input and output tensors that will be used in our example."
    ]
   },
diff --git a/pyproject.toml b/pyproject.toml
index 44723087..03d38dcd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,11 +3,12 @@ requires = ["setuptools"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "cutlass"
+name = "nvidia-cutlass"
 version = "3.3.0.0"
 description = "CUTLASS"
 readme = "README.md"
 requires-python = ">=3.8"
+license = {file = "LICENSE.txt"}
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: BSD License",
diff --git a/python/README.md b/python/README.md
index d86c6b2d..e84b7963 100644
--- a/python/README.md
+++ b/python/README.md
@@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
 plan.run(A, B, C, D)
 ```
 
-**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future.
-We welcome feedback from the community.
-
 ### Overview
 The CUTLASS Python interface aims to provide an ease-of-use interface for using CUTLASS via Python. Toward this goal,
 the CUTLASS Python interface attempts to:
@@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer
 **NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`.
 
 #### Installation
-The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing
+Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
+```bash
+pip install nvidia-cutlass
+```
+
+The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing
 ```bash
 pip install .
 ```
 
-If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform:
+If you would like to be able to make changes to the CUTLASS Python interface and have them reflected when using the interface, perform:
 ```bash
 pip install -e .
 ```
diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py
index 0af93357..0fd755ca 100644
--- a/python/cutlass/__init__.py
+++ b/python/cutlass/__init__.py
@@ -85,7 +85,15 @@ this = sys.modules[__name__]
 this.logger = logging.getLogger(__name__)
 
 # RMM is only supported for Python 3.9+
-this.use_rmm = (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3
+if sys.version_info >= (3, 9):  # tuple compare checks major AND minor (was: major > 8, never true)
+    try:
+        import rmm  # probe only: RMM is used when it is importable
+        this.use_rmm = True
+    except ImportError:
+        this.use_rmm = False  # RMM not installed
+else:
+    this.use_rmm = False  # interpreter older than 3.9
+
 
 def set_log_level(level: int):
     """
@@ -134,9 +142,8 @@ def get_memory_pool():
     return this.memory_pool
 
 
-from cuda import cuda
+from cuda import cuda, cudart
 
-this._context = None
 this._device_id = None
 def initialize_cuda_context():
     if this._device_id is not None:
@@ -149,10 +156,10 @@ def initialize_cuda_context():
     device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
     if device_id is None:
         if not this.use_rmm:
-            # We must manually call cuInit in the absence of RMM
-            err, = cuda.cuInit(0)
-            if err != cuda.CUresult.CUDA_SUCCESS:
-                raise Exception(f"cuInit failed with error {err}")
+            # Manually call cuInit() and create context by making a runtime API call
+            err, = cudart.cudaFree(0)
+            if err != cudart.cudaError_t.cudaSuccess:
+                raise RuntimeError(f"cudaFree failed with error {err}")
 
         err, device_count = cuda.cuDeviceGetCount()
         if err != cuda.CUresult.CUDA_SUCCESS:
@@ -163,16 +170,6 @@ def initialize_cuda_context():
 
     this._device_id = device_id
 
-    if not this.use_rmm and this._context is None:
-        # We must manually initialize the context in the absence of RMM
-        err, device = cuda.cuDeviceGet(this._device_id)
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise Exception(f"cuDeviceGet failed with error {err}")
-
-        err, this._context = cuda.cuCtxCreate(0, device)
-        if err != cuda.CUresult.CUDA_SUCCESS:
-            raise Exception(f"cuCtxCreate failed with error {err}")
-
 
 def device_id() -> int:
     initialize_cuda_context()
diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py
index e04a4eb2..d1ed6296 100644
--- a/python/cutlass/backend/compiler.py
+++ b/python/cutlass/backend/compiler.py
@@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}"
 def compile_with_nvcc(cmd, source, error_file):
     succeed = True
     try:
-        subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as e:
         error_message = e.output.decode()
         with open(error_file, "w") as error_out:
@@ -82,20 +82,19 @@ class CompilationOptions:
         self.arch = arch
 
     def get_str(self):
-        options = ""
-
+        opts = []
         for flag in self.flags:
-            options += " " + flag
+            opts.append(flag)
 
         for incl in self.include_paths:
-            options += " --include-path=%s" % incl
+            opts.append(f"--include-path={incl}")
 
-        arch_flag = " -arch=sm_%d" % self.arch
+        arch_flag = f"-arch=sm_{self.arch}"
         if self.arch == 90:
             arch_flag += "a"
-        options += arch_flag
+        opts.append(arch_flag)
 
-        return options
+        return " ".join(opts)
 
     def get(self):
         options = []
@@ -104,9 +103,9 @@ class CompilationOptions:
             options.append(bytes(str.encode(flag)))
 
         for incl in self.include_paths:
-            options.append(bytes(str.encode("--include-path=%s" % incl)))
+            options.append(bytes(str.encode(f" --include-path={incl}")))
 
-        arch_flag = " -arch=sm_%d" % self.arch
+        arch_flag = f" -arch=sm_{self.arch}"
         if self.arch == 90:
             arch_flag += "a"
 
@@ -323,34 +322,35 @@ class ArtifactManager:
                 "tarfile": temp_cubin.name,
             }
             cmd = SubstituteTemplate(cmd_template, values)
-            compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt")
+            compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")
 
             # load the cubin image
             with open(temp_cubin.name, "rb") as file:
                 cubin_image = file.read()
 
-        # Set up the host-side library code
-        cmd_template = (
-            "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
-            % source_buffer_host
-        )
-        cmd = SubstituteTemplate(
-            cmd_template,
-            {
-                "cuda_install_path": cuda_install_path(),
-                "options": host_compilation_options.get_str(),
-            },
-        )
-
         tempfile.tempdir = "./"
-        temp = tempfile.NamedTemporaryFile(
+        temp_src = tempfile.NamedTemporaryFile(
+            prefix="host_src", suffix=".cu", delete=True)
+
+        # Write the host source
+        with open(temp_src.name, "w") as outfile:
+            outfile.write(source_buffer_host)
+
+        temp_dst = tempfile.NamedTemporaryFile(
             prefix="host_func", suffix=".so", delete=True)
 
-        cmd += " - -shared -o %s -lcudart -lcuda" % temp.name
-        compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
-        host_lib = ctypes.CDLL(temp.name)
+        # Set up host compilation arguments
+        cmd = []
+        cmd.append(f"{cuda_install_path()}/bin/nvcc")
+        cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
+        cmd.extend(host_compilation_options.get_str().split(" "))
+        cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])
 
-        return cubin_image, host_lib, temp
+        # Compile and load the library
+        compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
+        host_lib = ctypes.CDLL(temp_dst.name)
+
+        return cubin_image, host_lib, temp_dst
 
     def add_module(self, operations, compile_options=None, bypass_cache=False):
         """
diff --git a/python/docs_src/source/install.md b/python/docs_src/source/install.md
index 5d30740d..e0513fe1 100644
--- a/python/docs_src/source/install.md
+++ b/python/docs_src/source/install.md
@@ -1,5 +1,12 @@
 # Installation
 
+## Installing a stable release
+
+Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
+```bash
+pip install nvidia-cutlass
+```
+
 ## Installing from source
 
 Installing from source requires the latest CUDA Toolkit that matches the major.minor of CUDA Python installed.
diff --git a/setup.cfg b/setup.cfg
index 99b77096..78222b8c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-name = cutlass
+name = nvidia-cutlass
 version = 3.3.0.0
 
 [options]
@@ -28,3 +28,6 @@ include_package_data = True
 
 [options.package_data]
 cutlass_library.source = include/**/*, examples/**/*, tools/**/*
+
+[options.exclude_package_data]
+cutlass_library.source = include/**/*.py, examples/**/*.py, tools/**/*.py
\ No newline at end of file