v4.2 release. (#2587)

* Fix default cluster callback values to 1 to avoid a profiler failure when these values are not set on the command line.

* v4.2 release.
Junkai-Wu
2025-08-23 06:11:24 +08:00
committed by GitHub
parent 11cad1f67b
commit a49a78ffef
351 changed files with 28182 additions and 2032 deletions

View File

@ -69,7 +69,7 @@
"import numpy as np\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `False` to omit this information.\n",
@ -106,7 +106,7 @@
"metadata": {},
"source": [
"## Declaring and running a GEMM\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n",
"To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Gemm` call.\n",
"This sets up a default GEMM operation for the given device on which you are running.\n",
"\n",
"Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n",
@ -123,7 +123,7 @@
"source": [
"# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n",
"# specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan = cutlass_cppgen.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=np.float32)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},
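The comment in this hunk mentions matching "the kernel run by NumPy below"; a hedged sketch of such a reference check, where alpha, beta, and the tolerances are assumptions made for illustration:

    # Hedged NumPy reference check; accumulate in FP32 to mirror element_accumulator.
    tensor_D_ref = (alpha * (tensor_A.astype(np.float32) @ tensor_B.astype(np.float32))
                    + beta * tensor_C.astype(np.float32))
    np.testing.assert_allclose(tensor_D.astype(np.float32), tensor_D_ref, rtol=1e-2, atol=1e-1)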
@ -133,7 +133,7 @@
"id": "4a5856de",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
"There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor."
]
},
{
@ -172,7 +172,7 @@
"metadata": {},
"source": [
"## Changing operation modes\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass_cppgen.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
"\n",
"The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations."
]
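A short sketch of querying and switching the operation class described above; the printed value is illustrative:

    # Hedged sketch: read the current opclass, then switch explicitly.
    print(plan.opclass)                              # e.g. OpcodeClass.TensorOp on SM80
    plan.opclass = cutlass_cppgen.OpcodeClass.Simt   # force the SIMT fallback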
@ -197,7 +197,7 @@
"\n",
"As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
"\n",
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
]
},
{
@ -208,7 +208,7 @@
"outputs": [],
"source": [
"tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n",
"plan.opclass = cutlass.OpcodeClass.Simt\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.Simt\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)"
]
},
@ -262,7 +262,7 @@
"alpha = np.float16(1.)\n",
"beta = np.float16(2.)\n",
"\n",
"plan.opclass = cutlass.OpcodeClass.TensorOp\n",
"plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
]
},
@ -336,13 +336,13 @@
"# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n",
"# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n",
"if plan.cc < 90:\n",
" plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n",
" plan.swizzling_functor = cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n",
"else:\n",
" # Stream-K is currently only supported for warp-specialized cooperative kernels\n",
" td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n",
" td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative\n",
" td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
" td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK\n",
"\n",
" plan.compile(td)\n",
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
@ -391,12 +391,12 @@
"metadata": {},
"outputs": [],
"source": [
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"\n",
"# 3xTF32 requires SM80 or higher\n",
"if device_cc() >= 80:\n",
" plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n",
" plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
" plan.math_operation = cutlass_cppgen.MathOperation.multiply_add_fast_f32\n",
"\n",
" # Create input/output tensors in FP32\n",
" A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n",
@ -433,9 +433,9 @@
"\n",
"# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n",
"if device_cc() >= 90:\n",
" plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n",
" layout_C=cutlass.LayoutType.ColumnMajor)\n",
" plan = cutlass_cppgen.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
" layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor,\n",
" layout_C=cutlass_cppgen.LayoutType.ColumnMajor)\n",
"\n",
" # Create input/output tensors in FP8\n",
" A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n",

View File

@ -68,7 +68,7 @@
"source": [
"import numpy as np\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
@ -112,7 +112,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan = cutlass.op.Gemm(element=np.float16, layout=cutlass.LayoutType.RowMajor)\n",
"plan = cutlass_cppgen.op.Gemm(element=np.float16, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
]
},

View File

@ -75,7 +75,7 @@
"\n",
"## Declaring a grouped GEMM via the CUTLASS Python interface\n",
"A grouped GEMM operation is declared similarly to a GEMM operation in the CUTLASS Python interface: one\n",
"simply calls `cutlass.op.GroupedGemm`."
"simply calls `cutlass_cppgen.op.GroupedGemm`."
]
},
{
@ -85,11 +85,11 @@
"metadata": {},
"outputs": [],
"source": [
"import cutlass\n",
"import cutlass_cppgen\n",
"import torch\n",
"\n",
"dtype = torch.float16\n",
"plan = cutlass.op.GroupedGemm(element=dtype, layout=cutlass.LayoutType.RowMajor)"
"plan = cutlass_cppgen.op.GroupedGemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor)"
]
},
{
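Running the grouped plan is not shown in this hunk; a hedged sketch assuming `plan.run()` accepts per-problem lists of tensors, with illustrative problem sizes:

    # Sketch only -- problem sizes and the list-based run() call are assumptions.
    sizes = [(128, 256, 64), (64, 128, 32)]
    As = [torch.randn(m, k, dtype=dtype, device="cuda") for m, n, k in sizes]
    Bs = [torch.randn(k, n, dtype=dtype, device="cuda") for m, n, k in sizes]
    Cs = [torch.zeros(m, n, dtype=dtype, device="cuda") for m, n, k in sizes]
    Ds = [torch.zeros_like(C) for C in Cs]
    plan.run(As, Bs, Cs, Ds, print_module=True)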
@ -174,7 +174,7 @@
"outputs": [],
"source": [
"op = plan.construct()\n",
"grouped_gemm = cutlass.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
"grouped_gemm = cutlass_cppgen.emit.pytorch(op, name='grouped_gemm', cc=plan.cc, sourcedir='out', jit=True)"
]
},
{
@ -182,7 +182,7 @@
"id": "c8ca3991",
"metadata": {},
"source": [
"The `cutlass.emit.pytorch` function emits:\n",
"The `cutlass_cppgen.emit.pytorch` function emits:\n",
"* `out/grouped_gemm_kernel.cu`: This file contains the declaration of the CUTLASS kernel and a method to call it from PyTorch tensors\n",
"* `out/grouped_gemm.cpp`: This file contains a C++ wrapper around the aforementioned CUTLASS kernel\n",
"* `setup.py`: This file contains the `setuptools` script for building and installing the generated extension\n",

View File

@ -62,7 +62,7 @@
"import torch\n",
"import random\n",
"\n",
"import cutlass\n",
"import cutlass_cppgen\n",
"\n",
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
"# Set to `false` to omit this information.\n",
@ -80,7 +80,7 @@
"dilation = (1, 1)\n",
"\n",
"# Compute the output size [N, P, Q, K]\n",
"N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"N, P, Q, K = cutlass_cppgen.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
"\n",
"dtype = torch.float16\n",
"type_A = torch.float16\n",
@ -111,7 +111,7 @@
"source": [
"## Declaring and running a Conv2d Fprop\n",
"\n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
"\n",
"Assuming that we are runing on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n",
"\n",
@ -125,7 +125,7 @@
"outputs": [],
"source": [
"# Specifying `element_accumulator` is not required if it is the same as `element`\n",
"plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan = cutlass_cppgen.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
"plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)"
]
},
@ -133,7 +133,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n",
"There are many other ways to construct a plan from `cutlass_cppgen.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass_cppgen.op.Conv2dFprop` constructor.\n",
"\n",
"We then compare the output to running the Conv2d using PyTorch. PyTorch use NCHW layout by default, so permutations are required."
]
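A hedged sketch of that PyTorch comparison, permuting the NHWC tensors to NCHW for `torch.nn.functional.conv2d` and back; the tolerances are illustrative:

    import torch.nn.functional as F

    # NHWC -> NCHW for activation and filter, then back to NHWC before comparing.
    output_torch = alpha * F.conv2d(
        input.permute(0, 3, 1, 2), weight.permute(0, 3, 1, 2),
        stride=stride, padding=padding, dilation=dilation
    ).permute(0, 2, 3, 1) + beta * tensor_C
    print(torch.allclose(output, output_torch, rtol=1e-2, atol=1e-2))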
@ -200,7 +200,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad = cutlass_cppgen.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n",
@ -225,7 +225,7 @@
"metadata": {},
"outputs": [],
"source": [
"plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad = cutlass_cppgen.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
"plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
"\n",
"grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n",

View File

@ -67,17 +67,17 @@
"outputs": [],
"source": [
"import torch\n",
"import cutlass\n",
"from cutlass.epilogue import relu\n",
"from cutlass import Tensor as FakeTensor\n",
"from cutlass.utils.profiler import CUDAEventProfiler\n",
"import cutlass_cppgen\n",
"from cutlass_cppgen.epilogue import relu\n",
"from cutlass_cppgen import Tensor as FakeTensor\n",
"from cutlass_cppgen.utils.profiler import CUDAEventProfiler\n",
"\n",
"# This controls whether ther C++ GEMM declaration will be printed at each step. Set to `false` to\n",
"# omit this information.\n",
"print_module = True\n",
"\n",
"# The Epilogue Visitor feature currently only works for SM80 and 90\n",
"from cutlass.backend.utils.device import device_cc\n",
"from cutlass_cppgen.backend.utils.device import device_cc\n",
"if device_cc() not in [80, 90]:\n",
" import sys\n",
" sys.exit()\n",
@ -99,7 +99,7 @@
"tensor_C = torch.ceil(torch.empty(size=(m, n), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_D = torch.zeros_like(tensor_C)\n",
"\n",
"plan = cutlass.op.Gemm(element=torch.float16, layout=cutlass.LayoutType.RowMajor, element_accumulator=torch.float32)"
"plan = cutlass_cppgen.op.Gemm(element=torch.float16, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=torch.float32)"
]
},
{
@ -115,7 +115,7 @@
"\n",
"The example tensors is a dictionary with tensor names as keys and reference tensors as values. The reference tensors can be `float`, `torch.Tensor`, `numpy.ndarray`, or our `FakeTensor`. They provides the shape and data type information of the inputs and outputs of the epilogue.\n",
"\n",
"The epilogue can be generated simply through `cutlass.evt.trace(<epilogue function>, <example_tensors>)`."
"The epilogue can be generated simply through `cutlass_cppgen.evt.trace(<epilogue function>, <example_tensors>)`."
]
},
{
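The `example_epilogue` function being traced is defined outside this hunk; a hedged illustration of what such a function can look like, reusing the tensor names that appear in the example-tensors dictionary below (the actual body in the notebook may differ):

    # Illustrative only -- argument names mirror the example-tensor keys; the
    # returned D and F become the epilogue's stored outputs.
    def example_epilogue(accum, alpha, C, beta, bias):
        F = alpha * accum + beta * C   # linear-combination epilogue
        D = relu(F) + bias             # extra fused elementwise ops
        return D, F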
@ -139,7 +139,7 @@
"bias = torch.ceil(torch.empty(size=(m, 1), dtype=type_C, device=\"cuda\").uniform_(scope_min, scope_max))\n",
"tensor_F = torch.zeros_like(tensor_D)\n",
"examples_tensors = {\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass.LayoutType.RowMajor),\n",
" \"accum\": FakeTensor(element=torch.float32, shape=(m, n), layout_tag=cutlass_cppgen.LayoutType.RowMajor),\n",
" \"alpha\": alpha,\n",
" \"C\": tensor_C,\n",
" \"beta\": beta,\n",
@ -150,7 +150,7 @@
"}\n",
"\n",
"# Trace the epilogue visitor\n",
"epilogue_visitor = cutlass.epilogue.trace(example_epilogue, examples_tensors)"
"epilogue_visitor = cutlass_cppgen.epilogue.trace(example_epilogue, examples_tensors)"
]
},
{
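The remaining steps fall outside this diff; a hedged sketch of attaching the traced visitor and running the plan, where the `visitor_args` keys and the attachment API are assumptions based on earlier CUTLASS Python releases, and tensor_A/tensor_B are assumed to be defined alongside tensor_C/tensor_D earlier in the notebook:

    # Assumed follow-up -- not confirmed by this diff.
    plan.epilogue_visitor = epilogue_visitor
    visitor_args = {"alpha": alpha, "C": tensor_C, "beta": beta,
                    "bias": bias, "D": tensor_D, "F": tensor_F}
    plan.run(tensor_A, tensor_B, tensor_C, tensor_D,
             visitor_args=visitor_args, print_module=print_module)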