Updates to Python interface for PyPI packaging (#1209)
* Updates * Updates to notebooks
This commit is contained in:
@ -18,9 +18,6 @@ A, B, C, D = [np.ones((4096, 4096), dtype=np.float16) for i in range(4)]
|
||||
plan.run(A, B, C, D)
|
||||
```
|
||||
|
||||
**NOTE:** The CUTLASS Python interface is currently an experimental release. The API may change in the future.
|
||||
We welcome feedback from the community.
|
||||
|
||||
### Overview
|
||||
The CUTLASS Python interface aims to provide an easy-to-use interface for using CUTLASS via Python. Toward this goal,
|
||||
the CUTLASS Python interface attempts to:
|
||||
@ -87,12 +84,17 @@ If these environment variables are not set, the installation process will infer
|
||||
**NOTE:** The version of `cuda-python` installed must match the CUDA version in `CUDA_INSTALL_PATH`.
|
||||
|
||||
#### Installation
|
||||
The CUTLASS Python interface can currently be installed by navigating to the root of the CUTLASS directory and performing
|
||||
Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
|
||||
```bash
|
||||
pip install nvidia-cutlass
|
||||
```
|
||||
|
||||
The CUTLASS Python interface can also be installed from source by navigating to the root of the CUTLASS directory and performing
|
||||
```bash
|
||||
pip install .
|
||||
```
|
||||
|
||||
If you would like to be able to make changes to CULASS Python interface and have them reflected when using the interface, perform:
|
||||
If you would like to be able to make changes to the CUTLASS Python interface and have them reflected when using the interface, perform:
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
@ -85,7 +85,15 @@ this = sys.modules[__name__]
|
||||
this.logger = logging.getLogger(__name__)
|
||||
|
||||
# RMM is only supported for Python 3.9+
|
||||
this.use_rmm = (sys.version_info.major == 3 and sys.version_info.major > 8) or sys.version_info.major > 3
|
||||
# Compare the whole version tuple. The previous guard tested `major > 8` where
# `minor` was intended, so it was never true on any Python 3 release and
# `use_rmm` always ended up False.
if sys.version_info >= (3, 9):
    try:
        # Imported only to probe availability; the module object is not used here.
        import rmm  # noqa: F401

        this.use_rmm = True
    except ImportError:
        # RMM is optional: record its absence instead of failing the import.
        this.use_rmm = False
else:
    this.use_rmm = False
|
||||
|
||||
|
||||
def set_log_level(level: int):
|
||||
"""
|
||||
@ -134,9 +142,8 @@ def get_memory_pool():
|
||||
return this.memory_pool
|
||||
|
||||
|
||||
from cuda import cuda
|
||||
from cuda import cuda, cudart
|
||||
|
||||
this._context = None
|
||||
this._device_id = None
|
||||
def initialize_cuda_context():
|
||||
if this._device_id is not None:
|
||||
@ -149,10 +156,10 @@ def initialize_cuda_context():
|
||||
device_id = os.getenv("CUTLASS_CUDA_DEVICE_ID")
|
||||
if device_id is None:
|
||||
if not this.use_rmm:
|
||||
# We must manually call cuInit in the absence of RMM
|
||||
err, = cuda.cuInit(0)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise Exception(f"cuInit failed with error {err}")
|
||||
# Manually call cuInit() and create context by making a runtime API call
|
||||
err, = cudart.cudaFree(0)
|
||||
if err != cudart.cudaError_t.cudaSuccess:
|
||||
raise RuntimeError(f"cudaFree failed with error {err}")
|
||||
|
||||
err, device_count = cuda.cuDeviceGetCount()
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
@ -163,16 +170,6 @@ def initialize_cuda_context():
|
||||
|
||||
this._device_id = device_id
|
||||
|
||||
if not this.use_rmm and this._context is None:
|
||||
# We must manually initialize the context in the absence of RMM
|
||||
err, device = cuda.cuDeviceGet(this._device_id)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise Exception(f"cuDeviceGet failed with error {err}")
|
||||
|
||||
err, this._context = cuda.cuCtxCreate(0, device)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise Exception(f"cuCtxCreate failed with error {err}")
|
||||
|
||||
|
||||
def device_id() -> int:
|
||||
initialize_cuda_context()
|
||||
|
||||
@ -53,7 +53,7 @@ IncludeTemplate = r"""#include "${include}"
|
||||
def compile_with_nvcc(cmd, source, error_file):
|
||||
succeed = True
|
||||
try:
|
||||
subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
|
||||
subprocess.check_output(cmd, stderr=subprocess.STDOUT)
|
||||
except subprocess.CalledProcessError as e:
|
||||
error_message = e.output.decode()
|
||||
with open(error_file, "w") as error_out:
|
||||
@ -82,20 +82,19 @@ class CompilationOptions:
|
||||
self.arch = arch
|
||||
|
||||
def get_str(self):
|
||||
options = ""
|
||||
|
||||
opts = []
|
||||
for flag in self.flags:
|
||||
options += " " + flag
|
||||
opts.append(flag)
|
||||
|
||||
for incl in self.include_paths:
|
||||
options += " --include-path=%s" % incl
|
||||
opts.append(f"--include-path={incl}")
|
||||
|
||||
arch_flag = " -arch=sm_%d" % self.arch
|
||||
arch_flag = f"-arch=sm_{self.arch}"
|
||||
if self.arch == 90:
|
||||
arch_flag += "a"
|
||||
options += arch_flag
|
||||
opts.append(arch_flag)
|
||||
|
||||
return options
|
||||
return " ".join(opts)
|
||||
|
||||
def get(self):
|
||||
options = []
|
||||
@ -104,9 +103,9 @@ class CompilationOptions:
|
||||
options.append(bytes(str.encode(flag)))
|
||||
|
||||
for incl in self.include_paths:
|
||||
options.append(bytes(str.encode("--include-path=%s" % incl)))
|
||||
options.append(bytes(str.encode(f" --include-path={incl}")))
|
||||
|
||||
arch_flag = " -arch=sm_%d" % self.arch
|
||||
arch_flag = f" -arch=sm_{self.arch}"
|
||||
if self.arch == 90:
|
||||
arch_flag += "a"
|
||||
|
||||
@ -323,34 +322,35 @@ class ArtifactManager:
|
||||
"tarfile": temp_cubin.name,
|
||||
}
|
||||
cmd = SubstituteTemplate(cmd_template, values)
|
||||
compile_with_nvcc(cmd, source_buffer_device, "./cutlass_python_compilation_device_error.txt")
|
||||
compile_with_nvcc(cmd.split(" "), source_buffer_device, "./cutlass_python_compilation_device_error.txt")
|
||||
|
||||
# load the cubin image
|
||||
with open(temp_cubin.name, "rb") as file:
|
||||
cubin_image = file.read()
|
||||
|
||||
# Set up the host-side library code
|
||||
cmd_template = (
|
||||
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
|
||||
% source_buffer_host
|
||||
)
|
||||
cmd = SubstituteTemplate(
|
||||
cmd_template,
|
||||
{
|
||||
"cuda_install_path": cuda_install_path(),
|
||||
"options": host_compilation_options.get_str(),
|
||||
},
|
||||
)
|
||||
|
||||
tempfile.tempdir = "./"
|
||||
temp = tempfile.NamedTemporaryFile(
|
||||
temp_src = tempfile.NamedTemporaryFile(
|
||||
prefix="host_src", suffix=".cu", delete=True)
|
||||
|
||||
# Write the host source
|
||||
with open(temp_src.name, "w") as outfile:
|
||||
outfile.write(source_buffer_host)
|
||||
|
||||
temp_dst = tempfile.NamedTemporaryFile(
|
||||
prefix="host_func", suffix=".so", delete=True)
|
||||
|
||||
cmd += " - -shared -o %s -lcudart -lcuda" % temp.name
|
||||
compile_with_nvcc(cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
|
||||
host_lib = ctypes.CDLL(temp.name)
|
||||
# Set up host compilation arguments
|
||||
cmd = []
|
||||
cmd.append(f"{cuda_install_path()}/bin/nvcc")
|
||||
cmd.extend(["-x", "cu", "-Xcompiler=-fpermissive", "-Xcompiler=-w", "-Xcompiler=-fPIC"])
|
||||
cmd.extend(host_compilation_options.get_str().split(" "))
|
||||
cmd.extend(["-shared", "-o", temp_dst.name, temp_src.name, "-lcudart", "-lcuda"])
|
||||
|
||||
return cubin_image, host_lib, temp
|
||||
# Compile and load the library
|
||||
compile_with_nvcc( cmd, source_buffer_host, error_file="./cutlass_python_compilation_host_error.txt")
|
||||
host_lib = ctypes.CDLL(temp_dst.name)
|
||||
|
||||
return cubin_image, host_lib, temp_dst
|
||||
|
||||
def add_module(self, operations, compile_options=None, bypass_cache=False):
|
||||
"""
|
||||
|
||||
@ -1,5 +1,12 @@
|
||||
# Installation
|
||||
|
||||
## Installing a stable release
|
||||
|
||||
Stable releases of the CUTLASS Python interface are available via the `nvidia-cutlass` PyPI package. Any other packages with the name `cutlass` are not affiliated with NVIDIA CUTLASS.
|
||||
```bash
|
||||
pip install nvidia-cutlass
|
||||
```
|
||||
|
||||
## Installing from source
|
||||
|
||||
Installing from source requires the latest CUDA Toolkit whose major.minor version matches that of the installed CUDA Python package.
|
||||
|
||||
Reference in New Issue
Block a user