From 8098336d512ef089a2f0e0fa172d5ff5cb18eca5 Mon Sep 17 00:00:00 2001 From: Jack Kosaian Date: Tue, 28 Nov 2023 12:52:12 -0600 Subject: [PATCH] Updates to Python interface for PyPI packaging (#1209) * Updates * Updates to notebooks --- examples/python/00_basic_gemm.ipynb | 44 +++++++++++++- examples/python/01_epilogue.ipynb | 45 +++++++++++++- .../02_pytorch_extension_grouped_gemm.ipynb | 46 +++++++++++++++ examples/python/03_basic_conv2d.ipynb | 42 ++++++++++++++ examples/python/04_epilogue_visitor.ipynb | 45 +++++++++++++- pyproject.toml | 3 +- python/README.md | 12 ++-- python/cutlass/__init__.py | 31 +++++----- python/cutlass/backend/compiler.py | 58 +++++++++---------- python/docs_src/source/install.md | 7 +++ setup.cfg | 5 +- 11 files changed, 279 insertions(+), 59 deletions(-) diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/00_basic_gemm.ipynb index e7a130b6..afd6dab4 100644 --- a/examples/python/00_basic_gemm.ipynb +++ b/examples/python/00_basic_gemm.ipynb @@ -7,7 +7,46 @@ "metadata": {}, "source": [ "# Basic example of using the CUTLASS Python interface\n", - "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n" + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/00_basic_gemm.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "df94d7e6", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71c7a069", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "cf16785d", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c819bb68", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" ] }, { @@ -16,7 +55,8 @@ "id": "962324fd", "metadata": {}, "source": [ - "We first import various packages needed for the example and construct the input and output tensors that will be used in our example.\n" + "## General setup\n", + "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." ] }, { diff --git a/examples/python/01_epilogue.ipynb b/examples/python/01_epilogue.ipynb index 13acbffd..a58446e4 100644 --- a/examples/python/01_epilogue.ipynb +++ b/examples/python/01_epilogue.ipynb @@ -7,14 +7,55 @@ "metadata": {}, "source": [ "# Example of using elementwise activation functions in the CUTLASS Python interface\n", - "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n" + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/01_epilogue.ipynb)\n" ] }, { "cell_type": "markdown", - "id": "3ca993fe", + "id": "28c916da", "metadata": {}, "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fcea8ea", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "7ec60b57", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1db9e51c", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## General setup\n", "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." ] }, diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/02_pytorch_extension_grouped_gemm.ipynb index ecd78280..9196af13 100644 --- a/examples/python/02_pytorch_extension_grouped_gemm.ipynb +++ b/examples/python/02_pytorch_extension_grouped_gemm.ipynb @@ -10,6 +10,52 @@ "This notebook walks through a basic example of using the CUTLASS Python interface to declare\n", "a grouped GEMM kernel and export it as a PyTorch CUDA extension. Note that GEMM and Conv2d can also be exported as PyTorch CUDA extensions. \n", "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/02_pytorch_extension_grouped_gemm.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "id": "2d70560e", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc7c7458", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "2107bb0d", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface and PyTorch. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9852cb8", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass torch --extra-index-url https://download.pytorch.org/whl/cu121" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ "## Background on grouped GEMM\n", "Grouped GEMM enables one to execute a set of GEMMs (each with potentially different sizes and strides)\n", "in a single CUDA kernel. It can be thought of as a generalized version of a pointer-array GEMM,\n", diff --git a/examples/python/03_basic_conv2d.ipynb b/examples/python/03_basic_conv2d.ipynb index 962add39..c428319a 100644 --- a/examples/python/03_basic_conv2d.ipynb +++ b/examples/python/03_basic_conv2d.ipynb @@ -8,6 +8,48 @@ "\n", "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run Conv2d. \n", "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/03_basic_conv2d.ipynb)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## General setup\n", "We first import various packages needed for the example and construct the input and output tensors that will be used in our example." ] }, diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/04_epilogue_visitor.ipynb index 69a32226..5a147bcb 100644 --- a/examples/python/04_epilogue_visitor.ipynb +++ b/examples/python/04_epilogue_visitor.ipynb @@ -7,14 +7,55 @@ "metadata": {}, "source": [ "# Example of using epilogue visitor in the CUTLASS Python interface\n", - "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor." + "This notebook walks through a basic example of using the CUTLASS Python interface to declare, compile, and run GEMMs with different epilogues through CUTLASS Epilogue Visitor.\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NVIDIA/cutlass/tree/master/examples/python/04_epilogue_visitor.ipynb)\n" ] }, { "cell_type": "markdown", - "id": "3ca993fe", + "id": "3a800e79", "metadata": {}, "source": [ + "## Prerequisites for running on Colab\n", + "This notebook requires an NVIDIA GPU. If `nvidia-smi` fails, go to Runtime -> Change runtime type -> Hardware accelerator and confirm a GPU is selected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cfff2c8", + "metadata": {}, + "outputs": [], + "source": [ + "!#nvidia-smi" + ] + }, + { + "cell_type": "markdown", + "id": "06706f00", + "metadata": {}, + "source": [ + "If running on Colab, you will need to install the CUTLASS Python interface. To do so, uncomment the following line and run the cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "491a7314", + "metadata": {}, + "outputs": [], + "source": [ + "!#pip install nvidia-cutlass" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "962324fd", + "metadata": {}, + "source": [ + "## General setup\n", "We first import various packages needed for the example, construct the input and output tensors that will be used in our example." ] }, diff --git a/pyproject.toml b/pyproject.toml index 44723087..03d38dcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,11 +3,12 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] -name = "cutlass" +name = "nvidia-cutlass" version = "3.3.0.0" description = "CUTLASS" readme = "README.md" requires-python = ">=3.8" +license = {file = "LICENSE.txt"} classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: BSD License", diff --git a/python/README.md b/python/README.md index d86c6b2d..e84b7963 100644 --- a/python/README.md +++ b/python/README.md @@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)] plan.run(A, B, C, D) ``` -**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future. -We welcome feedback from the community. - ### Overview The CUTLASS Python interface aims to provide an ease-of-use interface for using CUTLASS via Python. Toward this goal, the CUTLASS Python interface attempts to: @@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer **NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`. #### Installation -The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing +Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS. +```bash +pip install nvidia-cutlass +``` + +The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing ```bash pip install . ``` -If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform: +If you would like to be able to make changes to CUTLASS Python interface and have them reflected when using the interface, perform: ```bash pip install -e . ``` diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py index 0af93357..0fd755ca 100644 --- a/python/cutlass/__init__.py +++ b/python/cutlass/__init__.py @@ -85,7 +85,15 @@ this = sys.modules[__name__] this.logger = logging.getLogger(__name__) # RMM is only supported for Python 3.9+ -this.use_rmm = (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3 +if (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3: + try: + import rmm + this.use_rmm = True + except ImportError: + this.use_rmm = False +else: + this.use_rmm = False + def set_log_level(level: int): """ @@ -134,9 +142,8 @@ def get_memory_pool(): return this.memory_pool -from cuda import cuda +from cuda import cuda, cudart -this._context = None this._device_id = None def initialize_cuda_context(): if this._device_id is not None: @@ -149,10 +156,10 @@ def initialize_cuda_context(): device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID") if device_id is None: if not this.use_rmm: - # We must manually call cuInit in the absence of RMM - err, = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: - raise Exception(f"cuInit failed with error {err}") + # Manually call cuInit() and create context by making a runtime API call + err, = cudart.cudaFree(0) + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"cudaFree failed with error {err}") err, device_count = cuda.cuDeviceGetCount() if err != cuda.CUresult.CUDA_SUCCESS: @@ -163,16 +170,6 @@ def initialize_cuda_context(): this._device_id = device_id - if not this.use_rmm and this._context is None: - # We must manually initialize the context in the absence of RMM - err, device = cuda.cuDeviceGet(this._device_id) - if err != cuda.CUresult.CUDA_SUCCESS: - raise Exception(f"cuDeviceGet failed with error {err}") - - err, this._context = cuda.cuCtxCreate(0, device) - if err != cuda.CUresult.CUDA_SUCCESS: - raise Exception(f"cuCtxCreate failed with error {err}") - def device_id() -> int: initialize_cuda_context() diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index e04a4eb2..d1ed6296 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}" def compile_with_nvcc(cmd, source, error_file): succeed = True try: - subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True) + subprocess.check_output(cmd, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: error_message = e.output.decode() with open(error_file, "w") as error_out: @@ -82,20 +82,19 @@ class CompilationOptions: self.arch = arch def get_str(self): - options = "" - + opts = [] for flag in self.flags: - options += " " + flag + opts.append(flag) for incl in self.include_paths: - options += " --include-path=%s" % incl + opts.append(f"--include-path={incl}") - arch_flag = " -arch=sm_%d" % self.arch + arch_flag = f"-arch=sm_{self.arch}" if self.arch == 90: arch_flag += "a" - options += arch_flag + opts.append(arch_flag) - return options + return " ".join(opts) def get(self): options = [] @@ -104,9 +103,9 @@ class CompilationOptions: options.append(bytes(str.encode(flag))) for incl in self.include_paths: - options.append(bytes(str.encode("--include-path=%s" % incl))) + options.append(bytes(str.encode(f" --include-path={incl}"))) - arch_flag = " -arch=sm_%d" % self.arch + arch_flag = f" -arch=sm_{self.arch}" if self.arch == 90: arch_flag += "a" @@ -323,34 +322,35 @@ class ArtifactManager: "tarfile": temp_cubin.name, } cmd = SubstituteTemplate(cmd_template, values) - compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt") + compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt") # load the cubin image with open(temp_cubin.name, "rb") as file: cubin_image = file.read() - # Set up the host-side library code - cmd_template = ( - "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" - % source_buffer_host - ) - cmd = SubstituteTemplate( - cmd_template, - { - "cuda_install_path": cuda_install_path(), - "options": host_compilation_options.get_str(), - }, - ) - tempfile.tempdir = "./" - temp = tempfile.NamedTemporaryFile( + temp_src = tempfile.NamedTemporaryFile( + prefix="host_src", suffix=".cu", delete=True) + + # Write the host source + with open(temp_src.name, "w") as outfile: + outfile.write(source_buffer_host) + + temp_dst = tempfile.NamedTemporaryFile( prefix="host_func", suffix=".so", delete=True) - cmd += " - -shared -o %s -lcudart -lcuda" % temp.name - compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt") - host_lib = ctypes.CDLL(temp.name) + # Set up host compilation arguments + cmd = [] + cmd.append(f"{cuda_install_path()}/bin/nvcc") + cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"]) + cmd.extend(host_compilation_options.get_str().split(" ")) + cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"]) - return cubin_image, host_lib, temp + # Comile and load the library + compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt") + host_lib = ctypes.CDLL(temp_dst.name) + + return cubin_image, host_lib, temp_dst def add_module(self, operations, compile_options=None, bypass_cache=False): """ diff --git a/python/docs_src/source/install.md b/python/docs_src/source/install.md index 5d30740d..e0513fe1 100644 --- a/python/docs_src/source/install.md +++ b/python/docs_src/source/install.md @@ -1,5 +1,12 @@ # Installation +## Installing a stable release + +Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS. +```bash +pip install nvidia-cutlass +``` + ## Installing from source Installing from source requires the latest CUDA Toolkit that matches the major.minor of CUDA Python installed. diff --git a/setup.cfg b/setup.cfg index 99b77096..78222b8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -name = cutlass +name = nvidia-cutlass version = 3.3.0.0 [options] @@ -28,3 +28,6 @@ include_package_data = True [options.package_data] cutlass_library.source = include/**/*, examples/**/*, tools/**/* + +[options.exclude_package_data] +cutlass_library.source = include/**/*.py, examples/**/*.py, tools/**/*.py \ No newline at end of file