CUTLASS 3.3.0 (#1167)

* Release 3.3.0

Adds support for mixed precision GEMMs on Hopper and Ampere.
Adds support for < 16B aligned GEMMs on Hopper
Enhancements to EVT
Enhancements to Python interface
Enhancements to Sub-byte type handling in CuTe
Several other bug-fixes and performance improvements.

* minor doc update
This commit is contained in:
Pradeep Ramani
2023-11-02 08:09:05 -07:00
committed by GitHub
parent 922fb5108b
commit c008b4aea8
263 changed files with 16214 additions and 5008 deletions

View File

@ -269,6 +269,7 @@
"metadata": {},
"outputs": [],
"source": [
"tiles = [td for td in tiles if td.threadblock_shape[0] >= 128]\n",
"idx = random.randint(0, len(tiles)-1)\n",
"td = tiles[idx]\n",
"print('Tile description {} is: {}'.format(idx, td))\n",

View File

@ -29,7 +29,7 @@
"import cutlass\n",
"from cutlass.epilogue import relu\n",
"from cutlass import Tensor as FakeTensor\n",
"from cutlass.profiler import CUDAEventProfiler\n",
"from cutlass.utils.profiler import CUDAEventProfiler\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
@ -160,10 +160,6 @@
" return example_epilogue(accum, alpha, C, beta, aux, bias)\n",
"\n",
"torch_reference = TorchReference()\n",
"if hasattr(torch, \"compile\"):\n",
" # If the torch.compile feature is available\n",
" torch_reference = torch.compile(torch_reference)\n",
"\n",
"tensor_D_ref, tensor_F_ref = torch_reference(tensor_A, tensor_B, alpha, tensor_C, beta, aux, bias)\n",
"\n",
"assert torch.equal(tensor_D, tensor_D_ref)\n",