Updates to Python interface for PyPI packaging (#1209)

* Updates

* Updates to notebooks
This commit is contained in:
Jack Kosaian
2023-11-28 12:52:12 -06:00
committed by GitHub
parent b5d8a5d9cc
commit 8098336d51
11 changed files with 279 additions and 59 deletions

View File

@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
plan.run(A, B, C, D)
```
**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future.
We welcome feedback from the community.
### Overview
The CUTLASS Python interface aims to provide an easy-to-use interface for using CUTLASS via Python. Toward this goal,
the CUTLASS Python interface attempts to:
@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer
**NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`.
#### Installation
The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing
Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
```bash
pip install nvidia-cutlass
```
The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing
```bash
pip install .
```
If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform:
If you would like to be able to make changes to CUTLASS Python interface and have them reflected when using the interface, perform:
```bash
pip install -e .
```

View File

@ -85,7 +85,15 @@ this = sys.modules[__name__]
this.logger = logging.getLogger(__name__)
# RMM is only supported for Python 3.9+
this.use_rmm = (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3
if (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3:
try:
import rmm
this.use_rmm = True
except ImportError:
this.use_rmm = False
else:
this.use_rmm = False
def set_log_level(level: int):
"""
@ -134,9 +142,8 @@ def get_memory_pool():
return this.memory_pool
from cuda import cuda
from cuda import cuda, cudart
this._context = None
this._device_id = None
def initialize_cuda_context():
if this._device_id is not None:
@ -149,10 +156,10 @@ def initialize_cuda_context():
device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
if device_id is None:
if not this.use_rmm:
# We must manually call cuInit in the absence of RMM
err, = cuda.cuInit(0)
if err != cuda.CUresult.CUDA_SUCCESS:
raise Exception(f"cuInit failed with error {err}")
# Manually call cuInit() and create context by making a runtime API call
err, = cudart.cudaFree(0)
if err != cudart.cudaError_t.cudaSuccess:
raise RuntimeError(f"cudaFree failed with error {err}")
err, device_count = cuda.cuDeviceGetCount()
if err != cuda.CUresult.CUDA_SUCCESS:
@ -163,16 +170,6 @@ def initialize_cuda_context():
this._device_id = device_id
if not this.use_rmm and this._context is None:
# We must manually initialize the context in the absence of RMM
err, device = cuda.cuDeviceGet(this._device_id)
if err != cuda.CUresult.CUDA_SUCCESS:
raise Exception(f"cuDeviceGet failed with error {err}")
err, this._context = cuda.cuCtxCreate(0, device)
if err != cuda.CUresult.CUDA_SUCCESS:
raise Exception(f"cuCtxCreate failed with error {err}")
def device_id() -> int:
initialize_cuda_context()

View File

@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}"
def compile_with_nvcc(cmd, source, error_file):
succeed = True
try:
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
error_message = e.output.decode()
with open(error_file, "w") as error_out:
@ -82,20 +82,19 @@ class CompilationOptions:
self.arch = arch
def get_str(self):
options = ""
opts = []
for flag in self.flags:
options += " " + flag
opts.append(flag)
for incl in self.include_paths:
options += " --include-path=%s" % incl
opts.append(f"--include-path={incl}")
arch_flag = " -arch=sm_%d" % self.arch
arch_flag = f"-arch=sm_{self.arch}"
if self.arch == 90:
arch_flag += "a"
options += arch_flag
opts.append(arch_flag)
return options
return " ".join(opts)
def get(self):
options = []
@ -104,9 +103,9 @@ class CompilationOptions:
options.append(bytes(str.encode(flag)))
for incl in self.include_paths:
options.append(bytes(str.encode("--include-path=%s" % incl)))
options.append(bytes(str.encode(f" --include-path={incl}")))
arch_flag = " -arch=sm_%d" % self.arch
arch_flag = f" -arch=sm_{self.arch}"
if self.arch == 90:
arch_flag += "a"
@ -323,34 +322,35 @@ class ArtifactManager:
"tarfile": temp_cubin.name,
}
cmd = SubstituteTemplate(cmd_template, values)
compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt")
compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")
# load the cubin image
with open(temp_cubin.name, "rb") as file:
cubin_image = file.read()
# Set up the host-side library code
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": cuda_install_path(),
"options": host_compilation_options.get_str(),
},
)
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
temp_src = tempfile.NamedTemporaryFile(
prefix="host_src", suffix=".cu", delete=True)
# Write the host source
with open(temp_src.name, "w") as outfile:
outfile.write(source_buffer_host)
temp_dst = tempfile.NamedTemporaryFile(
prefix="host_func", suffix=".so", delete=True)
cmd += " - -shared -o %s -lcudart -lcuda" % temp.name
compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
host_lib = ctypes.CDLL(temp.name)
# Set up host compilation arguments
cmd = []
cmd.append(f"{cuda_install_path()}/bin/nvcc")
cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
cmd.extend(host_compilation_options.get_str().split(" "))
cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])
return cubin_image, host_lib, temp
# Compile and load the library
compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
host_lib = ctypes.CDLL(temp_dst.name)
return cubin_image, host_lib, temp_dst
def add_module(self, operations, compile_options=None, bypass_cache=False):
"""

View File

@ -1,5 +1,12 @@
# Installation
## Installing a stable release
Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
```bash
pip install nvidia-cutlass
```
## Installing from source
Installing from source requires the latest CUDA Toolkit that matches the major.minor of CUDA Python installed.