v4.2 release. (#2587)
* Fix default cluster callback values to 1 to avoid profiler failure when these values are not set in command line. * v4.2 release.
This commit is contained in:
@ -69,7 +69,7 @@
|
||||
"import numpy as np\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"import cutlass\n",
|
||||
"import cutlass_cppgen\n",
|
||||
"\n",
|
||||
"# This controls whether the C++ GEMM declaration will be printed at each step. \n",
|
||||
"# Set to `False` to omit this information.\n",
|
||||
@ -106,7 +106,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Declaring and running a GEMM\n",
|
||||
"To get started, one only needs to provide the tensors declared above to the `cutlass.op.Gemm` call.\n",
|
||||
"To get started, one only needs to provide the tensors declared above to the `cutlass_cppgen.op.Gemm` call.\n",
|
||||
"This sets up a default GEMM operation for the given device on which you are running.\n",
|
||||
"\n",
|
||||
"Assuming that we are running on SM80, this default to using a GEMM that leverages FP16 Tensor Core operations.\n",
|
||||
@ -123,7 +123,7 @@
|
||||
"source": [
|
||||
"# We specify `element_accumulator` here so as to match the kernel run by NumPy below. However,\n",
|
||||
"# specifying `element_accumulator` is not required if it is the same as `element`\n",
|
||||
"plan = cutlass.Gemm(element=dtype, layout=cutlass.LayoutType.RowMajor, element_accumulator=np.float32)\n",
|
||||
"plan = cutlass_cppgen.Gemm(element=dtype, layout=cutlass_cppgen.LayoutType.RowMajor, element_accumulator=np.float32)\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -133,7 +133,7 @@
|
||||
"id": "4a5856de",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"There are many other ways to construct a plan from `cutlass.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass.op.Gemm` constructor."
|
||||
"There are many other ways to construct a plan from `cutlass_cppgen.op.Gemm` (e.g., by specifiying they types and layouts of each operand, by providing representative tensors as inputs). For more details on these, see the documentation in the `cutlass_cppgen.op.Gemm` constructor."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -172,7 +172,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Changing operation modes\n",
|
||||
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
|
||||
"By default, the CUTLASS Python interface will try to use Tensor Core operations whenever possible. If the configuration provided to `cutlass_cppgen.op.Gemm` is not supported on Tensor Cores, the interface will fall back to using a SIMT kernel.\n",
|
||||
"\n",
|
||||
"The operation mode currently in use can be returned via the `plan.opclass` property. In this case Tensor Core operations."
|
||||
]
|
||||
@ -197,7 +197,7 @@
|
||||
"\n",
|
||||
"As is shown in the printed output, the emitted kernel uses template parameters that fit CUTLASS's SIMT GEMMs.\n",
|
||||
"\n",
|
||||
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
|
||||
"Also notice that, this time around, we provided tensor parameters to `plan.run()`. One is free to provide different parameters to `plan.run()` than were passed in at the initial call to `cutlass_cppgen.op.Gemm`, provided that the passed-in tensors have the same data type and layout as those passed in on intialization."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -208,7 +208,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensor_D_simt = np.zeros(tensor_C.shape).astype(type_D)\n",
|
||||
"plan.opclass = cutlass.OpcodeClass.Simt\n",
|
||||
"plan.opclass = cutlass_cppgen.OpcodeClass.Simt\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D_simt, alpha, beta, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -262,7 +262,7 @@
|
||||
"alpha = np.float16(1.)\n",
|
||||
"beta = np.float16(2.)\n",
|
||||
"\n",
|
||||
"plan.opclass = cutlass.OpcodeClass.TensorOp\n",
|
||||
"plan.opclass = cutlass_cppgen.OpcodeClass.TensorOp\n",
|
||||
"plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
|
||||
]
|
||||
},
|
||||
@ -336,13 +336,13 @@
|
||||
"# Stream K is exposed through the threadblock swizzle method for pre-SM90 kernels,\n",
|
||||
"# and via the tile_scheduler attribute of the TileDescription for post-SM90 kernels\n",
|
||||
"if plan.cc < 90:\n",
|
||||
" plan.swizzling_functor = cutlass.swizzle.ThreadblockSwizzleStreamK\n",
|
||||
" plan.swizzling_functor = cutlass_cppgen.swizzle.ThreadblockSwizzleStreamK\n",
|
||||
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)\n",
|
||||
"else:\n",
|
||||
" # Stream-K is currently only supported for warp-specialized cooperative kernels\n",
|
||||
" td.kernel_schedule = cutlass.KernelScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.epilogue_schedule = cutlass.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.tile_scheduler = cutlass.TileSchedulerType.StreamK\n",
|
||||
" td.kernel_schedule = cutlass_cppgen.KernelScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.epilogue_schedule = cutlass_cppgen.EpilogueScheduleType.TmaWarpSpecializedCooperative\n",
|
||||
" td.tile_scheduler = cutlass_cppgen.TileSchedulerType.StreamK\n",
|
||||
"\n",
|
||||
" plan.compile(td)\n",
|
||||
" plan.run(tensor_A, tensor_B, tensor_C, tensor_D, alpha, beta, print_module=print_module)"
|
||||
@ -391,12 +391,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from cutlass.backend.utils.device import device_cc\n",
|
||||
"from cutlass_cppgen.backend.utils.device import device_cc\n",
|
||||
"\n",
|
||||
"# 3xTF32 requires SM80 or higher\n",
|
||||
"if device_cc() >= 80:\n",
|
||||
" plan = cutlass.op.Gemm(element=np.float32, layout=cutlass.LayoutType.RowMajor)\n",
|
||||
" plan.math_operation = cutlass.MathOperation.multiply_add_fast_f32\n",
|
||||
" plan = cutlass_cppgen.op.Gemm(element=np.float32, layout=cutlass_cppgen.LayoutType.RowMajor)\n",
|
||||
" plan.math_operation = cutlass_cppgen.MathOperation.multiply_add_fast_f32\n",
|
||||
"\n",
|
||||
" # Create input/output tensors in FP32\n",
|
||||
" A, B = [np.ones((128, 128)).astype(np.float32) for _ in range(2)]\n",
|
||||
@ -433,9 +433,9 @@
|
||||
"\n",
|
||||
"# FP8 is supported through the CUTLASS Python interface on SM90 and higher\n",
|
||||
"if device_cc() >= 90:\n",
|
||||
" plan = cutlass.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
|
||||
" layout_A=cutlass.LayoutType.RowMajor, layout_B=cutlass.LayoutType.ColumnMajor,\n",
|
||||
" layout_C=cutlass.LayoutType.ColumnMajor)\n",
|
||||
" plan = cutlass_cppgen.op.Gemm(element=torch.float8_e4m3fn, element_C=torch.float32, element_accumulator=torch.float32,\n",
|
||||
" layout_A=cutlass_cppgen.LayoutType.RowMajor, layout_B=cutlass_cppgen.LayoutType.ColumnMajor,\n",
|
||||
" layout_C=cutlass_cppgen.LayoutType.ColumnMajor)\n",
|
||||
"\n",
|
||||
" # Create input/output tensors in FP8\n",
|
||||
" A, B = [torch.ones((128, 128)).to(torch.float8_e4m3fn).to(\"cuda\") for _ in range(2)]\n",
|
||||
|
||||
Reference in New Issue
Block a user