# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import time
from typing import Type

import cuda.bindings.driver as cuda
import torch

import cutlass
import cutlass.cute as cute
import cutlass.cute.testing as testing
import cutlass.torch as cutlass_torch
from cutlass.cute.runtime import from_dlpack

"""
An Elementwise Addition Example using CuTe DSL.

This example kernel copies data from global memory to register memory (rmem),
performs the elementwise addition, and stores the result back to global memory.

The primary goals of this example are to demonstrate how basic global memory
copies can be expressed in CuTe DSL and to illustrate canonical partitioning
patterns in CuTe. It also implements canonical predication for tensors whose
shape is not a multiple of the tile size, to guard against out-of-bounds reads.

Thread-value (or TV) layouts are central to canonical partitioning patterns in
CuTe. They provide a mapping from a thread and that thread's value index to the
set of coordinates within a tile that we have sliced out of a data tensor.

The input tensors are row-major, i.e., the leading (unit-stride) dimension is
the rightmost one. To copy data efficiently from global memory, we must map
threads contiguously along the row dimension.

Thread ID mapping to 2D coordinates with layout `(4,32):(32,1)`:

+----+----+----+----+-----+----+
|    | 0  | 1  | 2  | ... | 31 |
+----+----+----+----+-----+----+
| 0  | T0 | T1 | T2 | ... | T31|
+----+----+----+----+-----+----+
| 1  |T32 |T33 |T34 | ... |T63 |
+----+----+----+----+-----+----+
| 2  |T64 |T65 |T66 | ... |T95 |
+----+----+----+----+-----+----+
| 3  |T96 |T97 |T98 | ... |T127|
+----+----+----+----+-----+----+

Since an Ampere GPU supports at most 128 bits per load/store instruction and
each element here is 32 bits, we can load 4 elements per instruction. Having
additional contiguous values per thread enables vectorization across threads
(coalesced accesses) and is required for saturating the memory bandwidth. We
use `(4,4):(4,1)` as the val layout in this example. Notice that its major mode
matches the major mode of the input tensor - without this, vectorization would
not be possible.
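
For reference, the index arithmetic encoded by the `(4,32):(32,1)` thread
layout can be written out in plain Python (this is only an illustration of the
layout math behind the diagram above, not CuTe DSL code):

.. code-block:: python

    # Layout (4,32):(32,1) maps coordinate (i, j) -> linear index i*32 + j,
    # so inverting it for a thread id recovers the (row, column) position
    # shown in the diagram above.
    def thread_coord(tid):
        return tid // 32, tid % 32

    assert thread_coord(0) == (0, 0)       # T0
    assert thread_coord(33) == (1, 1)      # T33
    assert thread_coord(127) == (3, 31)    # T127

    # 128-bit accesses with 32-bit elements give 4 elements per instruction,
    # which is where the (4,4):(4,1) val layout comes from.
    vector_size = 128 // 32  # = 4
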
If you already know the thread and value layouts you want to use for your
tiled copy, CuTe DSL provides the utility `cute.make_layout_tv` to build the
block tiler and TV layout from them:

.. code-block:: python

    thr_layout = cute.make_layout((4, 32), stride=(32, 1))
    val_layout = cute.make_layout((4, 4), stride=(4, 1))
    tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)

    # Tile input tensor to thread blocks: ((TileM,TileN),(RestM,RestN))
    gA = cute.zipped_divide(mA, tiler_mn)

We can then build tiled copies for the input and output tensors with the
`cute.make_tiled_copy_tv` utility, which infers the tiler and TV layout for the
tiled copy automatically. Here the tiler is the tile size per thread block, and
the TV layout maps a thread index together with that thread's value index to
the logical coordinates of elements in the input and output tensors.

.. code-block:: python

    blkA = gA[((None, None), bidx)]  # (TileM,TileN)

    copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type)
    tiled_copy_A = cute.make_tiled_copy_tv(copy_atom_load, thr_layout, val_layout)

    # get slice of tiled_copy_A for current thread
    thr_copy_A = tiled_copy_A.get_slice(tidx)

    # partition per thread block tensor as source of tiled copy
    thrA = thr_copy_A.partition_S(blkA)

    # allocate fragment for gmem->rmem
    frgA = cute.make_fragment_like(thrA)

    # copy data from global memory to register memory
    cute.copy(copy_atom_load, thrA, frgA)
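
When the tensor shape is not a multiple of the tile size, the copies are
predicated so that out-of-bounds elements are skipped. The sketch below mirrors
the predication pattern used by the kernel in this file: a coordinate
(identity) tensor is partitioned exactly like the data, and each thread builds
a boolean mask from its coordinates.

.. code-block:: python

    # Coordinate tensor: each element holds its own (m, n) coordinate
    idC = cute.make_identity_tensor(mC.shape)
    cC = cute.zipped_divide(idC, tiler=tiler_mn)
    blkCrd = cC[((None, None), bidx)]
    thrCrd = thr_copy_A.partition_S(blkCrd)

    # Per-thread boolean mask: True where the coordinate lies inside the tensor
    frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean)
    for i in range(cute.size(frgPred)):
        frgPred[i] = cute.elem_less(thrCrd[i], mC.shape)

    # Predicated copy skips the out-of-bounds elements
    cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
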
To run this example:

.. code-block:: bash

    python examples/ampere/elementwise_add.py --M 3 --N 12
    python examples/ampere/elementwise_add.py --M 1024 --N 512
    python examples/ampere/elementwise_add.py --M 1024 --N 1024 --benchmark --warmup_iterations 2 --iterations 1000

To collect performance with NCU profiler:

.. code-block:: bash

    # Don't iterate too many times when profiling with ncu
    ncu python examples/ampere/elementwise_add.py --M 2048 --N 2048 --benchmark --iterations 10 --skip_ref_check
"""


@cute.kernel
def elementwise_add_kernel(
    gA: cute.Tensor,
    gB: cute.Tensor,
    gC: cute.Tensor,
    cC: cute.Tensor,  # coordinate tensor
    shape: cute.Shape,
    thr_layout: cute.Layout,
    val_layout: cute.Layout,
):
    tidx, _, _ = cute.arch.thread_idx()
    bidx, _, _ = cute.arch.block_idx()

    # slice for CTAs
    # logical id -> address
    blk_coord = ((None, None), bidx)
    blkA = gA[blk_coord]  # (TileM,TileN)
    blkB = gB[blk_coord]  # (TileM,TileN)
    blkC = gC[blk_coord]  # (TileM,TileN)
    blkCrd = cC[blk_coord]  # (TileM,TileN)

    # Note: these prints only run at compile/jit time
    print(f"[DSL INFO] Sliced Tensors per thread block:")
    print(f"[DSL INFO] blkA = {blkA.type}")
    print(f"[DSL INFO] blkB = {blkB.type}")
    print(f"[DSL INFO] blkC = {blkC.type}")
    print(f"[DSL INFO] blkCrd = {blkCrd.type}")

    # declare the atoms which will be used later for memory copy
    copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type)
    copy_atom_store = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gC.element_type)

    tiled_copy_A = cute.make_tiled_copy_tv(copy_atom_load, thr_layout, val_layout)
    tiled_copy_B = cute.make_tiled_copy_tv(copy_atom_load, thr_layout, val_layout)
    tiled_copy_C = cute.make_tiled_copy_tv(copy_atom_store, thr_layout, val_layout)

    thr_copy_A = tiled_copy_A.get_slice(tidx)
    thr_copy_B = tiled_copy_B.get_slice(tidx)
    thr_copy_C = tiled_copy_C.get_slice(tidx)

    thrA = thr_copy_A.partition_S(blkA)
    thrB = thr_copy_B.partition_S(blkB)
    thrC = thr_copy_C.partition_S(blkC)

    # allocate fragments for gmem->rmem
    frgA = cute.make_fragment_like(thrA)
    frgB = cute.make_fragment_like(thrB)
    frgC = cute.make_fragment_like(thrC)

    thrCrd = thr_copy_C.partition_S(blkCrd)
    frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean)

    print(f"[DSL INFO] Sliced Tensors per thread:")
    print(f"[DSL INFO] thrA = {thrA.type}")
    print(f"[DSL INFO] thrB = {thrB.type}")
    print(f"[DSL INFO] thrC = {thrC.type}")
    print(f"[DSL INFO] thrCrd = {thrCrd.type}")

    for i in range(0, cute.size(frgPred), 1):
        val = cute.elem_less(thrCrd[i], shape)
        frgPred[i] = val

    # Print per-thread predicate mask
    # if tidx == 0 and bidx == 0:
    #     cute.printf("block_dim = {}", cute.arch.grid_dim())
    #     cute.printf("shape = {}", shape)
    #     cute.print_tensor(thrA)
    #     cute.print_tensor(thrB)
    #     cute.print_tensor(frgPred)

    ##########################################################
    # Move data to reg address space
    ##########################################################
    cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
    cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)

    # if tidx == 0 and bidx == 0:
    #     cute.print_tensor(frgA)
    #     cute.print_tensor(frgB)

    # Load data before use. The compiler will optimize the copy and load
    # operations to convert some memory ld/st into register uses.
    result = frgA.load() + frgB.load()

    # Save the result into C's register fragment
    frgC.store(result)

    # Copy the results back to c
    cute.copy(copy_atom_store, frgC, thrC, pred=frgPred)


@cute.jit
def elementwise_add(mA, mB, mC, copy_bits: cutlass.Constexpr = 128):
    dtype = mA.element_type
    vector_size = copy_bits // dtype.width

    thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0))
    val_layout = cute.make_ordered_layout((4, vector_size), order=(1, 0))
    tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)

    print(f"[DSL INFO] Input Tensors:")
    print(f"[DSL INFO] mA = {mA.type}")
    print(f"[DSL INFO] mB = {mB.type}")

    print(f"[DSL INFO] Tiling Parameters:")
    print(f"[DSL INFO] tiler_mn = {tiler_mn} per thread block")
    print(f"[DSL INFO] tv_layout = {tv_layout}")

    gA = cute.zipped_divide(mA, tiler_mn)  # ((TileM,TileN),(RestM,RestN))
    gB = cute.zipped_divide(mB, tiler_mn)  # ((TileM,TileN),(RestM,RestN))
    gC = cute.zipped_divide(mC, tiler_mn)  # ((TileM,TileN),(RestM,RestN))

    print(f"[DSL INFO] Tiled Tensors:")
    print(f"[DSL INFO] gA = {gA.type}")
    print(f"[DSL INFO] gB = {gB.type}")
    print(f"[DSL INFO] gC = {gC.type}")

    idC = cute.make_identity_tensor(mC.shape)
    cC = cute.zipped_divide(idC, tiler=tiler_mn)
    print(f"[DSL INFO] coord tensor = {cC.type}")

    elementwise_add_kernel(gA, gB, gC, cC, mC.shape, thr_layout, val_layout).launch(
        grid=[cute.size(gC, mode=[1]), 1, 1],
        block=[cute.size(tv_layout, mode=[0]), 1, 1],
    )


def run_elementwise_add(
    M,
    N,
    dtype: Type[cutlass.Numeric],
    is_a_dynamic_layout=False,
    is_b_dynamic_layout=False,
    is_result_dynamic_layout=False,
    skip_ref_check=False,
    benchmark=True,
    warmup_iterations=2,
    iterations=200,
):
    print(f"\nRunning Elementwise Add test with:")
    print(f"Tensor dimensions: [{M}, {N}]")
    print(f"Input and Output Data type: {dtype}")

    torch_dtype = cutlass_torch.dtype(dtype)

    if dtype.is_integer:
        a = torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype)
        b = torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype)
    else:
        a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
        b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
    c = torch.zeros_like(a)

    print(f"Input tensor shapes:")
    print(f"a: {a.shape}, dtype: {a.dtype}")
    print(f"b: {b.shape}, dtype: {b.dtype}")
    print(f"c: {c.shape}, dtype: {c.dtype}\n")

    if not is_a_dynamic_layout:
        a_tensor = from_dlpack(a).mark_layout_dynamic()
    else:
        a_tensor = a

    if not is_b_dynamic_layout:
        b_tensor = from_dlpack(b).mark_layout_dynamic()
    else:
        b_tensor = b

    if not is_result_dynamic_layout:
        c_tensor = from_dlpack(c).mark_layout_dynamic()
    else:
        c_tensor = c

    print("Compiling kernel with cute.compile ...")
    start_time = time.time()
    compiled_func = cute.compile(elementwise_add, a_tensor, b_tensor, c_tensor)
    compilation_time = time.time() - start_time
    print(f"Compilation time: {compilation_time:.4f} seconds")

    print("Executing vector add kernel...")

    # Get current CUstream from torch
    current_stream = cutlass_torch.current_stream()

    if not skip_ref_check:
        compiled_func(a_tensor, b_tensor, c_tensor)
        print("Verifying results...")
        torch.testing.assert_close(a + b, c)
        print("Results verified successfully!")

    if not benchmark:
        return

    def generate_tensors():
        if dtype.is_integer:
            a = torch.randint(
                0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype
            )
            b = torch.randint(
                0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype
            )
        else:
            a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
            b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
        c = torch.zeros_like(a)

        if not is_a_dynamic_layout:
            a_tensor = from_dlpack(a).mark_layout_dynamic()
        else:
            a_tensor = a
        if not is_b_dynamic_layout:
            b_tensor = from_dlpack(b).mark_layout_dynamic()
        else:
            b_tensor = b

        if not is_result_dynamic_layout:
            c_tensor = from_dlpack(c).mark_layout_dynamic()
        else:
            c_tensor = c

        return testing.JitArguments(a_tensor, b_tensor, c_tensor)

    avg_time_us = testing.benchmark(
        compiled_func,
        workspace_generator=generate_tensors,
        workspace_count=10,
        warmup_iterations=warmup_iterations,
        iterations=iterations,
    )

    # Print execution results
    print(f"Kernel execution time: {avg_time_us / 1e3:.4f} ms")
    # Bytes moved per element: read a, read b, write c
    print(
        f"Achieved memory throughput: {(3 * a.numel() * dtype.width // 8) / (avg_time_us / 1e6) / 1e9:.2f} GB/s"
    )
    print(f"First few elements of result: \n{c[:3, :3]}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="example of elementwise add demonstrating numpy/pytorch tensors as input for kernels"
    )
    parser.add_argument("--M", default=1024, type=int)
    parser.add_argument("--N", default=1024, type=int)
    parser.add_argument("--warmup_iterations", default=2, type=int)
    parser.add_argument("--iterations", default=100, type=int)
    parser.add_argument("--skip_ref_check", action="store_true")
    parser.add_argument("--benchmark", action="store_true")
    args = parser.parse_args()

    if not torch.cuda.is_available():
        raise RuntimeError("Ampere GPU is required to run this example!")

    run_elementwise_add(
        args.M,
        args.N,
        dtype=cutlass.Float32,
        is_a_dynamic_layout=True,
        is_b_dynamic_layout=True,
        is_result_dynamic_layout=True,
        skip_ref_check=args.skip_ref_check,
        benchmark=args.benchmark,
        warmup_iterations=args.warmup_iterations,
        iterations=args.iterations,
    )

    print("\nPASS")