# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import cutlass.cute as cute
import cutlass
import torch
import numpy as np
from cutlass.cute.runtime import from_dlpack
"""
A Shared Memory Allocator Example on NVIDIA Ampere architecture using CuTe DSL.
This example demonstrates how to allocate and manage shared memory in JIT kernels by using the SmemAllocator in CuTe DSL.
It shows various ways to allocate different data structures in shared memory:
1. Struct allocation with natural and strict alignment
2. Raw memory block allocation with custom alignment
3. Array allocation with automatic alignment
4. Tensor allocation with layout specification
The example includes:
- Shared storage struct with mixed alignment requirements
- Memory allocation patterns for different data types
- Tensor operations on allocated memory
To run this example:
.. code-block:: bash

    python examples/ampere/smem_allocator.py

The example will allocate shared memory, perform tensor operations, and verify the results.
"""
@cute.struct
class complex:
real: cutlass.Float32
imag: cutlass.Float32
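# complex packs two Float32 fields, so it occupies 8 bytes; its natural alignment
# follows its Float32 members (4 bytes) unless a stricter alignment is requested.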
# SharedStorage size is 512 bytes, alignment is 128 bytes
@cute.struct
class SharedStorage:
# struct elements with natural alignment
a: cute.struct.MemRange[cutlass.Float32, 32] # array
    b: cutlass.Int64  # scalar
c: complex # nested struct
# struct elements with strict alignment
x: cute.struct.Align[
cute.struct.MemRange[cutlass.Float32, 32],
128,
]
y: cute.struct.Align[cutlass.Int32, 8]
z: cute.struct.Align[complex, 16]
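
# An illustrative field-offset breakdown of SharedStorage (assuming natural,
# C-like packing; offsets are relative to the struct base):
#   a (32 x Float32)                 -> bytes [  0, 128)
#   b (Int64)                        -> bytes [128, 136)
#   c (complex)                      -> bytes [136, 144)
#   x (32 x Float32, aligned to 128) -> bytes [256, 384)   (112 bytes of padding)
#   y (Int32, aligned to 8)          -> bytes [384, 388)
#   z (complex, aligned to 16)       -> bytes [400, 408)   (12 bytes of padding)
# Rounding 408 up to the 128-byte struct alignment gives the 512 bytes noted above.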
@cute.kernel
def kernel(
const_a: cutlass.Constexpr,
dst_a: cute.Tensor,
const_b: cutlass.Constexpr,
dst_b: cute.Tensor,
const_c: cutlass.Constexpr,
dst_c: cute.Tensor,
):
    # Note: SMEM_SIZE bytes (specified in kernel().launch(smem=...)) are reserved for the developer to use
    # Note: the initial base ptr of the allocator is aligned to 1024 bytes
allocator = cutlass.utils.SmemAllocator()
# base ptr of allocator points at: SMEM_ADDR_START (the starting address of available shared memory)
# -- Allocate a struct --
    # Note: when an alignment is specified, max(alignment, alignof(struct)) is applied
    # reserves a section for the struct in smem; the struct's elements can be accessed through the returned ptr
struct_in_smem = allocator.allocate(SharedStorage)
# base ptr of allocator now points at: SMEM_ADDR_AFTER_STRUCT = SMEM_ADDR_START + aligned_size(struct)
# -- Allocate a block of memory --
    # reserves a section of 64 bytes in smem, aligned to 128 bytes, and returns the section base ptr
section_in_smem = allocator.allocate(64, byte_alignment=128)
# base ptr of allocator now points at: SMEM_ADDR_AFTER_SECTION = SMEM_ADDR_AFTER_STRUCT + aligned_size(section)
# -- Allocate an array --
# reserves an int64 array of size 14 in smem, returns the array base ptr
array_in_smem = allocator.allocate_array(element_type=cutlass.Int64, num_elems=14)
# base ptr of allocator now points at: SMEM_ADDR_AFTER_ARRAY = SMEM_ADDR_AFTER_SECTION + aligned_size(array)
# -- Allocate a tensor --
# Note: use cute.ComposedLayout or cute.Layout to specify layout of tensor
# Note: iterator swizzle with swizzle layout is currently not supported
layout = cute.make_layout((16, 2))
tensor_in_smem = allocator.allocate_tensor(
element_type=cutlass.Float32, layout=layout, byte_alignment=32, swizzle=None
)
# base ptr of allocator now points at: SMEM_ADDR_AFTER_TENSOR = SMEM_ADDR_AFTER_ARRAY + aligned_size(tensor)
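    # Illustrative running offsets of the four allocations above, relative to
    # SMEM_ADDR_START (which is 1024-byte aligned):
    #   struct  (512 B, align 128) -> [  0, 512)
    #   section ( 64 B, align 128) -> [512, 576)
    #   array   (112 B, align   8) -> [576, 688)
    #   tensor  (128 B, align  32) -> [704, 832)   (16 bytes of padding)
    # 832 bytes in total, which fits in the 896 bytes requested at launch.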
    # ptr<f32, smem, align<1024>>
    print(struct_in_smem.a.data_ptr())
    # ptr<i64, smem, align<128>>
    print(struct_in_smem.b)
    # ptr<f32, smem, align<8>>
    print(struct_in_smem.c.real)
# ptr<i8, smem, align<512>>
print(section_in_smem)
# ptr<i64, smem, align<64>>
print(array_in_smem)
    # tensor<ptr<f32, smem, align<32>> o (16,2):(1,16)>
print(tensor_in_smem)
# fill MemRange tensor in struct and copy to dst
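    # the (8, 4) layout views all 32 Float32 elements of MemRange `a` as an 8x4 tensor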
a_tensor = struct_in_smem.a.get_tensor(cute.make_layout((8, 4)))
a_tensor.fill(const_a)
cute.printf("cute.struct.MemRange: {}", a_tensor)
dst_a.store(a_tensor.load())
    # reinterpret the raw block of smem as a Float32 tensor, fill it, and copy to dst
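    # the 64-byte raw section holds 16 Float32 values, so an (8, 2) layout covers it exactly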
layout = cute.make_layout((8, 2))
sec_ptr = cute.recast_ptr(section_in_smem, dtype=cutlass.Float32)
sec_tensor = cute.make_tensor(sec_ptr, layout)
sec_tensor.fill(const_b)
cute.printf("block of memory: {}", sec_tensor)
dst_b.store(sec_tensor.load())
# fill allocated tensor in smem and copy to dst
tensor_in_smem.fill(const_c)
cute.printf("tensor in smem: {}", tensor_in_smem)
dst_c.store(tensor_in_smem.load())
@cute.jit
def run_allocation_kernel(
const_a: cutlass.Constexpr,
dst_a: cute.Tensor,
const_b: cutlass.Constexpr,
dst_b: cute.Tensor,
const_c: cutlass.Constexpr,
dst_c: cute.Tensor,
):
    # additional size for the example: 64 (section) + 112 (array) + 128 (tensor) plus alignment padding < 384
    additional_bytes = 384
    # Note: shared memory size at launch is SMEM_SIZE = 512 + 384 = 896 bytes
kernel(const_a, dst_a, const_b, dst_b, const_c, dst_c).launch(
grid=(1, 1, 1),
block=(1, 1, 1),
        smem=SharedStorage.size_in_bytes() + additional_bytes,
)
def verify_allocation_kernel(const_a, const_b, const_c):
dst_a = torch.zeros((8, 4), dtype=torch.float32, device="cuda")
dst_b = torch.zeros((8, 2), dtype=torch.float32, device="cuda")
dst_c = torch.zeros((16, 2), dtype=torch.float32, device="cuda")
run_allocation_kernel(
const_a,
from_dlpack(dst_a),
const_b,
from_dlpack(dst_b),
const_c,
from_dlpack(dst_c),
)
np.testing.assert_equal(const_a, dst_a.detach().cpu().numpy()[0])
np.testing.assert_equal(const_b, dst_b.detach().cpu().numpy()[0])
np.testing.assert_equal(const_c, dst_c.detach().cpu().numpy()[0])
if __name__ == "__main__":
# prepare cuda context
cutlass.cuda.initialize_cuda_context()
# An example for shared memory allocation
const_a = 0.5
const_b = 1.0
const_c = 2.0
    verify_allocation_kernel(const_a, const_b, const_c)