CUTLASS 3.3.0 (#1167)

* Release 3.3.0

Adds support for mixed precision GEMMs on Hopper and Ampere.
Adds support for < 16B aligned GEMMs on Hopper
Enhancements to EVT
Enhancements to Python interface
Enhancements to Sub-byte type handling in CuTe
Several other bug-fixes and performance improvements.

* minor doc update
This commit is contained in:
Pradeep Ramani
2023-11-02 08:09:05 -07:00
committed by GitHub
parent 922fb5108b
commit c008b4aea8
263 changed files with 16214 additions and 5008 deletions

View File

@ -269,6 +269,7 @@
"metadata": {},
"outputs": [],
"source": [
"tiles = [td for td in tiles if td.threadblock_shape[0] >= 128]\n",
"idx = random.randint(0, len(tiles)-1)\n",
"td = tiles[idx]\n",
"print('Tile description {} is: {}'.format(idx, td))\n",

View File

@ -29,7 +29,7 @@
"import cutlass\n",
"from cutlass.epilogue import relu\n",
"from cutlass import Tensor as FakeTensor\n",
"from cutlass.profiler import CUDAEventProfiler\n",
"from cutlass.utils.profiler import CUDAEventProfiler\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
@ -160,10 +160,6 @@
" return example_epilogue(accum, alpha, C, beta, aux, bias)\n",
"\n",
"torch_reference = TorchReference()\n",
"if hasattr(torch, \"compile\"):\n",
" # If the torch.compile feature is available\n",
" torch_reference = torch.compile(torch_reference)\n",
"\n",
"tensor_D_ref, tensor_F_ref = torch_reference(tensor_A, tensor_B, alpha, tensor_C, beta, aux, bias)\n",
"\n",
"assert torch.equal(tensor_D, tensor_D_ref)\n",