# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# NOTE(fix): the prose below used to sit AFTER the imports, so it was a bare
# string expression rather than the module docstring. Placed first so that it
# is exported as ``__doc__``.
"""
A Shared Memory Allocator Example on NVIDIA Ampere architecture using CuTe DSL.

This example demonstrates how to allocate and manage shared memory in JIT
kernels by using the SmemAllocator in CuTe DSL. It shows various ways to
allocate different data structures in shared memory:

1. Struct allocation with natural and strict alignment
2. Raw memory block allocation with custom alignment
3. Array allocation with automatic alignment
4. Tensor allocation with layout specification

The example includes:
- Shared storage struct with mixed alignment requirements
- Memory allocation patterns for different data types
- Tensor operations on allocated memory

To run this example:

.. code-block:: bash

    python examples/ampere/smem_allocator.py

The example will allocate shared memory, perform tensor operations, and verify
the results.
"""

import cutlass.cute as cute
import cutlass
import torch
import numpy as np
from cutlass.cute.runtime import from_dlpack


# A minimal two-field struct used to demonstrate nested-struct allocation.
# NOTE: the name intentionally shadows the builtin ``complex`` to mirror the
# C++ CuTe example; it is only used as a smem layout descriptor here.
@cute.struct
class complex:
    real: cutlass.Float32
    imag: cutlass.Float32


# SharedStorage size is 512, alignment is 128
@cute.struct
class SharedStorage:
    # struct elements with natural alignment
    a: cute.struct.MemRange[cutlass.Float32, 32]  # array
    b: cutlass.Int64  # scalar
    c: complex  # nested struct
    # struct elements with strict alignment
    x: cute.struct.Align[
        cute.struct.MemRange[cutlass.Float32, 32],
        128,
    ]
    y: cute.struct.Align[cutlass.Int32, 8]
    z: cute.struct.Align[complex, 16]


@cute.kernel
def kernel(
    const_a: cutlass.Constexpr,
    dst_a: cute.Tensor,
    const_b: cutlass.Constexpr,
    dst_b: cute.Tensor,
    const_c: cutlass.Constexpr,
    dst_c: cute.Tensor,
):
    """Allocate several shared-memory regions, fill them with the given
    constants, and copy each region out to the matching ``dst_*`` tensor.

    Parameters:
        const_a/const_b/const_c: compile-time constants used as fill values.
        dst_a/dst_b/dst_c: global-memory tensors receiving the filled data
            (shapes (8, 4), (8, 2) and (16, 2) respectively — see the host
            code that allocates them).
    """
    # Note: SMEM_SIZE bytes (specified in kernel().launch(smem=...)) can be
    # reserved for developer to utilize
    # Note: alignment of initial allocator base ptr is 1024
    allocator = cutlass.utils.SmemAllocator()
    # base ptr of allocator points at: SMEM_ADDR_START (the starting address
    # of available shared memory)

    # -- Allocate a struct --
    # Note: when specified alignment, max(alignment, alignof(struct)) will be
    # applied
    # reserves the section of struct in smem, elements in the struct can be
    # accessed by ptr
    struct_in_smem = allocator.allocate(SharedStorage)
    # base ptr of allocator now points at:
    #   SMEM_ADDR_AFTER_STRUCT = SMEM_ADDR_START + aligned_size(struct)

    # -- Allocate a block of memory --
    # reserves a section of 64 bytes in smem, align to 128 bytes, returns the
    # section base ptr
    section_in_smem = allocator.allocate(64, byte_alignment=128)
    # base ptr of allocator now points at:
    #   SMEM_ADDR_AFTER_SECTION = SMEM_ADDR_AFTER_STRUCT + aligned_size(section)

    # -- Allocate an array --
    # reserves an int64 array of size 14 in smem, returns the array base ptr
    array_in_smem = allocator.allocate_array(element_type=cutlass.Int64, num_elems=14)
    # base ptr of allocator now points at:
    #   SMEM_ADDR_AFTER_ARRAY = SMEM_ADDR_AFTER_SECTION + aligned_size(array)

    # -- Allocate a tensor --
    # Note: use cute.ComposedLayout or cute.Layout to specify layout of tensor
    # Note: iterator swizzle with swizzle layout is currently not supported
    layout = cute.make_layout((16, 2))
    tensor_in_smem = allocator.allocate_tensor(
        element_type=cutlass.Float32, layout=layout, byte_alignment=32, swizzle=None
    )
    # base ptr of allocator now points at:
    #   SMEM_ADDR_AFTER_TENSOR = SMEM_ADDR_AFTER_ARRAY + aligned_size(tensor)

    # ptr<f32, smem>
    print(struct_in_smem.a.data_ptr())
    # ptr<i64, smem>
    print(struct_in_smem.b)
    # ptr<f32, smem>
    print(struct_in_smem.c.real)
    # ptr<i8, smem>
    print(section_in_smem)
    # ptr<i64, smem>
    print(array_in_smem)
    # tensor<ptr<f32, smem> o (16,4):(1,16)>
    print(tensor_in_smem)

    # fill MemRange tensor in struct and copy to dst
    a_tensor = struct_in_smem.a.get_tensor(cute.make_layout((8, 4)))
    a_tensor.fill(const_a)
    cute.printf("cute.struct.MemRange: {}", a_tensor)
    dst_a.store(a_tensor.load())

    # convert block of smem to fill tensor and copy to dst
    layout = cute.make_layout((8, 2))
    sec_ptr = cute.recast_ptr(section_in_smem, dtype=cutlass.Float32)
    sec_tensor = cute.make_tensor(sec_ptr, layout)
    sec_tensor.fill(const_b)
    cute.printf("block of memory: {}", sec_tensor)
    dst_b.store(sec_tensor.load())

    # fill allocated tensor in smem and copy to dst
    tensor_in_smem.fill(const_c)
    cute.printf("tensor in smem: {}", tensor_in_smem)
    dst_c.store(tensor_in_smem.load())


@cute.jit
def run_allocation_kernel(
    const_a: cutlass.Constexpr,
    dst_a: cute.Tensor,
    const_b: cutlass.Constexpr,
    dst_b: cute.Tensor,
    const_c: cutlass.Constexpr,
    dst_c: cute.Tensor,
):
    """Launch ``kernel`` on a single thread with enough dynamic shared memory
    for the struct plus the extra allocations performed inside the kernel."""
    # additional size for the example, 64(section) + 112(array) + 128(tensor) < 384
    additional_bytes = 384
    # Note: launch shared memory size is: SMEM_SIZE = 512 + 384 = 896 bytes
    kernel(const_a, dst_a, const_b, dst_b, const_c, dst_c).launch(
        grid=(1, 1, 1),
        block=(1, 1, 1),
        smem=SharedStorage.size_in_bytes() + additional_bytes,
    )


def verify_allocation_kernel(const_a, const_b, const_c):
    """Run the example kernel and check that each destination tensor was
    filled with the expected constant.

    Parameters:
        const_a: fill value checked against the (8, 4) float32 output.
        const_b: fill value checked against the (8, 2) float32 output.
        const_c: fill value checked against the (16, 2) float32 output.

    Raises:
        AssertionError: if any copied-back row differs from its constant.
    """
    dst_a = torch.zeros((8, 4), dtype=torch.float32, device="cuda")
    dst_b = torch.zeros((8, 2), dtype=torch.float32, device="cuda")
    dst_c = torch.zeros((16, 2), dtype=torch.float32, device="cuda")
    run_allocation_kernel(
        const_a,
        from_dlpack(dst_a),
        const_b,
        from_dlpack(dst_b),
        const_c,
        from_dlpack(dst_c),
    )
    # Each tensor was filled uniformly; comparing the first row suffices.
    np.testing.assert_equal(const_a, dst_a.detach().cpu().numpy()[0])
    np.testing.assert_equal(const_b, dst_b.detach().cpu().numpy()[0])
    np.testing.assert_equal(const_c, dst_c.detach().cpu().numpy()[0])


# Backward-compatible alias: the function was previously published under this
# misspelled name.
veify_allocation_kernel = verify_allocation_kernel


if __name__ == "__main__":
    # prepare cuda context
    cutlass.cuda.initialize_cuda_context()

    # An example for shared memory allocation
    const_a = 0.5
    const_b = 1.0
    const_c = 2.0
    verify_allocation_kernel(const_a, const_b, const_c)