v4.0 update. (#2371)
This commit is contained in:
200
examples/python/CuTeDSL/ampere/smem_allocator.py
Normal file
200
examples/python/CuTeDSL/ampere/smem_allocator.py
Normal file
@ -0,0 +1,200 @@
|
||||
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
import cutlass.cute as cute
|
||||
import cutlass
|
||||
import torch
|
||||
import numpy as np
|
||||
from cutlass.cute.runtime import from_dlpack
|
||||
|
||||
"""
|
||||
A Shared Memory Allocator Example on NVIDIA Ampere architecture using CuTe DSL.
|
||||
|
||||
This example demonstrates how to allocate and manage shared memory in JIT kernels by using the SmemAllocator in CuTe DSL.
|
||||
It shows various ways to allocate different data structures in shared memory:
|
||||
|
||||
1. Struct allocation with natural and strict alignment
|
||||
2. Raw memory block allocation with custom alignment
|
||||
3. Array allocation with automatic alignment
|
||||
4. Tensor allocation with layout specification
|
||||
|
||||
The example includes:
|
||||
- Shared storage struct with mixed alignment requirements
|
||||
- Memory allocation patterns for different data types
|
||||
- Tensor operations on allocated memory
|
||||
|
||||
To run this example:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python examples/python/CuTeDSL/ampere/smem_allocator.py
|
||||
|
||||
The example will allocate shared memory, perform tensor operations, and verify the results.
|
||||
"""
|
||||
|
||||
|
||||
@cute.struct
class complex:
    # A minimal complex-number struct: two naturally-aligned f32 fields.
    # Used below to demonstrate nested-struct allocation in shared memory.
    real: cutlass.Float32
    imag: cutlass.Float32
|
||||
|
||||
|
||||
# SharedStorage size is 512 bytes, alignment is 128 (driven by the strictest
# member alignment, the 128-byte-aligned `x` below).
@cute.struct
class SharedStorage:
    # struct elements with natural alignment
    a: cute.struct.MemRange[cutlass.Float32, 32]  # array of 32 f32
    b: cutlass.Int64  # scalar
    c: complex  # nested struct
    # struct elements with strict (explicitly requested) alignment
    x: cute.struct.Align[
        cute.struct.MemRange[cutlass.Float32, 32],
        128,
    ]
    y: cute.struct.Align[cutlass.Int32, 8]
    z: cute.struct.Align[complex, 16]
|
||||
|
||||
|
||||
@cute.kernel
def kernel(
    const_a: cutlass.Constexpr,
    dst_a: cute.Tensor,
    const_b: cutlass.Constexpr,
    dst_b: cute.Tensor,
    const_c: cutlass.Constexpr,
    dst_c: cute.Tensor,
):
    """Demonstrate four SmemAllocator allocation styles, then fill and copy.

    Fills three shared-memory regions with the compile-time constants
    ``const_a``/``const_b``/``const_c`` and stores them to the corresponding
    destination tensors ``dst_a`` (8x4), ``dst_b`` (8x2), ``dst_c`` (16x2).
    """
    # Note: SMEM_SIZE bytes (specified in kernel().launch(smem=...)) can be reserved for developer to utilize
    # Note: alignment of initial allocator base ptr is 1024
    allocator = cutlass.utils.SmemAllocator()
    # base ptr of allocator points at: SMEM_ADDR_START (the starting address of available shared memory)

    # -- Allocate a struct --
    # Note: when an alignment is specified, max(alignment, alignof(struct)) is applied
    # reserves the section of struct in smem; elements in the struct can be accessed by ptr
    struct_in_smem = allocator.allocate(SharedStorage)
    # base ptr of allocator now points at: SMEM_ADDR_AFTER_STRUCT = SMEM_ADDR_START + aligned_size(struct)

    # -- Allocate a block of memory --
    # reserves a section of 64 bytes in smem, aligned to 128 bytes; returns the section base ptr
    section_in_smem = allocator.allocate(64, byte_alignment=128)
    # base ptr of allocator now points at: SMEM_ADDR_AFTER_SECTION = SMEM_ADDR_AFTER_STRUCT + aligned_size(section)

    # -- Allocate an array --
    # reserves an int64 array of size 14 in smem; returns the array base ptr
    array_in_smem = allocator.allocate_array(element_type=cutlass.Int64, num_elems=14)
    # base ptr of allocator now points at: SMEM_ADDR_AFTER_ARRAY = SMEM_ADDR_AFTER_SECTION + aligned_size(array)

    # -- Allocate a tensor --
    # Note: use cute.ComposedLayout or cute.Layout to specify the layout of the tensor
    # Note: iterator swizzle with swizzle layout is currently not supported
    layout = cute.make_layout((16, 2))
    tensor_in_smem = allocator.allocate_tensor(
        element_type=cutlass.Float32, layout=layout, byte_alignment=32, swizzle=None
    )
    # base ptr of allocator now points at: SMEM_ADDR_AFTER_TENSOR = SMEM_ADDR_AFTER_ARRAY + aligned_size(tensor)

    # Expected printed pointer types (element types follow the declarations
    # above; exact alignments depend on the allocator layout — verify at run):
    # ptr<f32, smem, align<1024>>   (a: Float32 MemRange at the struct base)
    # ptr<i64, smem, align<128>>    (b)
    # ptr<f32, smem, align<8>>      (c.real)
    print(struct_in_smem.a.data_ptr())
    print(struct_in_smem.b)
    print(struct_in_smem.c.real)
    # ptr<i8, smem, align<512>>    (raw 64-byte section)
    print(section_in_smem)
    # ptr<i64, smem, align<64>>    (int64 array)
    print(array_in_smem)
    # tensor over f32 smem with layout (16,2):(1,16) — NOTE(review): an earlier
    # comment said (16,4); the layout built above is (16, 2)
    print(tensor_in_smem)

    # fill MemRange tensor in struct and copy to dst
    a_tensor = struct_in_smem.a.get_tensor(cute.make_layout((8, 4)))
    a_tensor.fill(const_a)
    cute.printf("cute.struct.MemRange: {}", a_tensor)
    dst_a.store(a_tensor.load())

    # view the raw smem block as an (8, 2) f32 tensor, fill it, and copy to dst
    layout = cute.make_layout((8, 2))
    sec_ptr = cute.recast_ptr(section_in_smem, dtype=cutlass.Float32)
    sec_tensor = cute.make_tensor(sec_ptr, layout)
    sec_tensor.fill(const_b)
    cute.printf("block of memory: {}", sec_tensor)
    dst_b.store(sec_tensor.load())

    # fill the allocated tensor in smem and copy to dst
    tensor_in_smem.fill(const_c)
    cute.printf("tensor in smem: {}", tensor_in_smem)
    dst_c.store(tensor_in_smem.load())
|
||||
|
||||
|
||||
@cute.jit
def run_allocation_kernel(
    const_a: cutlass.Constexpr,
    dst_a: cute.Tensor,
    const_b: cutlass.Constexpr,
    dst_b: cute.Tensor,
    const_c: cutlass.Constexpr,
    dst_c: cute.Tensor,
):
    """Launch ``kernel`` with enough dynamic shared memory for all allocations.

    :param const_a: compile-time fill value for the struct MemRange tensor
    :param dst_a: (8, 4) f32 destination tensor for the struct allocation
    :param const_b: compile-time fill value for the raw-section tensor
    :param dst_b: (8, 2) f32 destination tensor for the raw section
    :param const_c: compile-time fill value for the allocated tensor
    :param dst_c: (16, 2) f32 destination tensor for the tensor allocation
    """
    # Extra smem beyond SharedStorage: 64 (section) + 112 (array) + 128 (tensor)
    # plus alignment padding fits within 384 bytes.
    additional_bytes = 384  # fixed typo: was `addtional_bytes`
    # Note: total launched shared memory is SMEM_SIZE = 512 + 384 = 896 bytes
    kernel(const_a, dst_a, const_b, dst_b, const_c, dst_c).launch(
        grid=(1, 1, 1),
        block=(1, 1, 1),
        smem=SharedStorage.size_in_bytes() + additional_bytes,
    )
|
||||
|
||||
|
||||
def veify_allocation_kernel(const_a, const_b, const_c):
    """Run the allocation example and verify each destination tensor.

    Allocates zero-filled CUDA tensors with the shapes the kernel writes,
    launches the kernel, then checks that the first row of every output
    equals the constant the kernel filled that region with.
    """
    # One zeroed f32 CUDA tensor per allocation style exercised by the kernel.
    outputs = {
        name: torch.zeros(shape, dtype=torch.float32, device="cuda")
        for name, shape in (("a", (8, 4)), ("b", (8, 2)), ("c", (16, 2)))
    }

    run_allocation_kernel(
        const_a,
        from_dlpack(outputs["a"]),
        const_b,
        from_dlpack(outputs["b"]),
        const_c,
        from_dlpack(outputs["c"]),
    )

    # Each output's first row must equal the constant used to fill it.
    for expected, result in (
        (const_a, outputs["a"]),
        (const_b, outputs["b"]),
        (const_c, outputs["c"]),
    ):
        np.testing.assert_equal(expected, result.detach().cpu().numpy()[0])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Prepare the CUDA context before any kernel launch.
    cutlass.cuda.initialize_cuda_context()
    # Run the shared-memory allocation example with three distinct fill
    # values so each allocation style can be verified independently.
    const_a = 0.5
    const_b = 1.0
    const_c = 2.0
    veify_allocation_kernel(const_a=const_a, const_b=const_b, const_c=const_c)
|
||||
Reference in New Issue
Block a user