v4.2 release. (#2587)

* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set on the command line.

* v4.2 release.
Junkai-Wu
2025-08-23 06:11:24 +08:00
committed by GitHub
parent 11cad1f67b
commit a49a78ffef
351 changed files with 28182 additions and 2032 deletions


@@ -62,7 +62,7 @@
 "import torch\n",
 "import random\n",
 "\n",
-"import cutlass\n",
+"import cutlass_cppgen\n",
 "\n",
 "# This controls whether the C++ GEMM declaration will be printed at each step. \n",
 "# Set to `false` to omit this information.\n",
@@ -80,7 +80,7 @@
 "dilation = (1, 1)\n",
 "\n",
 "# Compute the output size [N, P, Q, K]\n",
-"N, P, Q, K = cutlass.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
+"N, P, Q, K = cutlass_cppgen.Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)\n",
 "\n",
 "dtype = torch.float16\n",
 "type_A = torch.float16\n",
@@ -111,7 +111,7 @@
 "source": [
 "## Declaring and running a Conv2d Fprop\n",
 "\n",
-"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
+"We first show you how to run a Conv2d in the forward propagation. To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Conv2dFprop` call. This sets up a default Conv2d fprop operation for the given device on which you are running. \n",
 "\n",
 "Assuming that we are running on SM80, the default is a Conv2d that leverages FP16 Tensor Core operations.\n",
 "\n",
@@ -125,7 +125,7 @@
 "outputs": [],
 "source": [
 "# Specifying `element_accumulator` is not required if it is the same as `element`\n",
-"plan = cutlass.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
+"plan = cutlass_cppgen.Conv2dFprop(element=dtype, element_accumulator=torch.float32)\n",
 "plan.run(input, weight, tensor_C, output, stride, padding, dilation, alpha, beta, print_module=print_module)"
 ]
 },
@@ -133,7 +133,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"There are many other ways to construct a plan from `cutlass.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass.op.Conv2dFprop` constructor.\n",
+"There are many other ways to construct a plan from `cutlass_cppgen.op.Conv2dFprop` (e.g., by specifying the types of each operand, by providing representative tensors as input). For more details on these, see the documentation in the `cutlass_cppgen.op.Conv2dFprop` constructor.\n",
 "\n",
 "We then compare the output to running the Conv2d using PyTorch. PyTorch uses NCHW layout by default, so permutations are required."
 ]
@@ -200,7 +200,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"plan_dgrad = cutlass.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
+"plan_dgrad = cutlass_cppgen.Conv2dDgrad(element=dtype, element_accumulator=torch.float32)\n",
 "plan_dgrad.run(grad_output, weight, tensor_C_dgrad, grad_input, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
 "\n",
 "grad_input_torch = alpha * torch.nn.grad.conv2d_input(\n",
@@ -225,7 +225,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"plan_wgrad = cutlass.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
+"plan_wgrad = cutlass_cppgen.Conv2dWgrad(element=dtype, element_accumulator=torch.float32)\n",
 "plan_wgrad.run(grad_output, input, tensor_C_wgrad, grad_weight, stride, padding, dilation, alpha, beta, print_module=print_module)\n",
 "\n",
 "grad_weight_torch = alpha * torch.nn.grad.conv2d_weight(\n",
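Across this diff only the Python module name changes from `cutlass` to `cutlass_cppgen`; the `Conv2d.output_size` call and the fprop/dgrad/wgrad plan interfaces keep the same arguments. For readers without the library installed, here is a minimal, self-contained sketch of the standard cross-correlation output-size arithmetic that a call like `Conv2d.output_size((N, H, W, C), (K, R, S, C), padding, stride, dilation)` performs. The helper name and the `(pad_h, pad_w)` padding convention are assumptions for illustration, not the library's actual code:

```python
# Hypothetical sketch of the arithmetic behind Conv2d.output_size.
# Assumes NHWC activations, KRSC filters, and (pad_h, pad_w) padding.
def conv2d_output_size(nhwc, krsc, padding, stride, dilation):
    N, H, W, C = nhwc
    K, R, S, _ = krsc
    pad_h, pad_w = padding
    stride_h, stride_w = stride
    dil_h, dil_w = dilation
    # Standard cross-correlation output extents (floor division).
    P = (H + 2 * pad_h - dil_h * (R - 1) - 1) // stride_h + 1
    Q = (W + 2 * pad_w - dil_w * (S - 1) - 1) // stride_w + 1
    return N, P, Q, K

# An 8x8 input with a 3x3 filter, unit stride, and unit padding
# keeps the spatial extent unchanged.
print(conv2d_output_size((1, 8, 8, 64), (32, 3, 3, 64), (1, 1), (1, 1), (1, 1)))
# → (1, 8, 8, 32)
```

This also makes the batch and channel behavior explicit: `N` passes through untouched and the output channel count is the filter count `K`, which matches the `[N, P, Q, K]` comment in the notebook cell above.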