CUTLASS 3.3.0 (#1167)
* Release 3.3.0 Adds support for mixed precision GEMMs On Hopper and Ampere Adds support for < 16B aligned GEMMs on Hopper Enhancements to EVT Enhancements to Python interface Enhancements to Sub-byte type handling in CuTe Several other bug-fixes and performance improvements. * minor doc update
This commit is contained in:
@ -269,6 +269,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tiles = [td for td in tiles if td.threadblock_shape[0] >= 128]\n",
|
||||
"idx = random.randint(0, len(tiles)-1)\n",
|
||||
"td = tiles[idx]\n",
|
||||
"print('Tile description {} is: {}'.format(idx, td))\n",
|
||||
|
||||
@ -29,7 +29,7 @@
|
||||
"import cutlass\n",
|
||||
"from cutlass.epilogue import relu\n",
|
||||
"from cutlass import Tensor as FakeTensor\n",
|
||||
"from cutlass.profiler import CUDAEventProfiler\n",
|
||||
"from cutlass.utils.profiler import CUDAEventProfiler\n",
|
||||
"\n",
|
||||
"# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n",
|
||||
"# omit this information.\n",
|
||||
@ -160,10 +160,6 @@
|
||||
" return example_epilogue(accum, alpha, C, beta, aux, bias)\n",
|
||||
"\n",
|
||||
"torch_reference = TorchReference()\n",
|
||||
"if hasattr(torch, \"compile\"):\n",
|
||||
" # If the torch.compile feature is available\n",
|
||||
" torch_reference = torch.compile(torch_reference)\n",
|
||||
"\n",
|
||||
"tensor_D_ref, tensor_F_ref = torch_reference(tensor_A, tensor_B, alpha, tensor_C, beta, aux, bias)\n",
|
||||
"\n",
|
||||
"assert torch.equal(tensor_D, tensor_D_ref)\n",
|
||||
|
||||
Reference in New Issue
Block a user