Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>. (#100)
- Updated mma_sm80.h to avoid perf penalty due to reinterpret_cast<>.
- Enhanced the CUTLASS Utility Library's HostTensorPlanarComplex template to support copy-in and copy-out.
- Added test_examples target to build and test all CUTLASS examples.
- Minor edits to documentation to point to GTC 2020 webinar.
This commit is contained in:
@ -161,6 +161,7 @@ compiled as C++11 or greater.
|
||||
#include <iostream>
|
||||
#include <cutlass/cutlass.h>
|
||||
#include <cutlass/numeric_types.h>
|
||||
#include <cutlass/core_io.h>
|
||||
|
||||
int main() {
|
||||
|
||||
@ -174,10 +175,13 @@ int main() {
|
||||
|
||||
## Launching a GEMM kernel in CUDA
|
||||
|
||||
**Example:** launch a mixed-precision GEMM targeting Turing Tensor Cores.
|
||||
**Example:** launch a mixed-precision GEMM targeting Turing Tensor Cores.
|
||||
|
||||
_Note: this example uses CUTLASS Utilities. Make sure `tools/util/include` is listed as an include path._
|
||||
```c++
|
||||
#include <cutlass/numeric_types.h>
|
||||
#include <cutlass/gemm/device/gemm.h>
|
||||
|
||||
#include <cutlass/util/host_tensor.h>
|
||||
|
||||
int main() {
|
||||
|
||||
Reference in New Issue
Block a user