Updates for 3.1 (#932)

This commit is contained in:
ANIKET SHIVAM
2023-04-29 06:34:27 -07:00
committed by GitHub
parent 6f8596ce3f
commit 7c04f95415
51 changed files with 1796 additions and 328 deletions

View File

@ -59,6 +59,8 @@ endif()
find_package(Doxygen QUIET)
################################################################################
#
# CUTLASS 3.x requires C++17
#
@ -80,6 +82,10 @@ endif()
message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive
################################################################################
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")
if(CUTLASS_ENABLE_HEADERS_ONLY)
@ -112,6 +118,8 @@ if (CUTLASS_ENABLE_TESTS)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
endif()
################################################################################
set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87)
@ -197,15 +205,16 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")
################################################################################
#
# CUTLASS generator cmake configuration
#
set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")
# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
################################################################################
set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests")
@ -225,6 +234,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()
################################################################################
#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
#
@ -650,14 +661,16 @@ function(cutlass_add_executable_tests NAME TARGET)
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of
# to pass to the test executable. A unique test is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for
# generating the full variable name to be referenced.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
#
set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@ -701,7 +714,6 @@ function(cutlass_add_executable_tests NAME TARGET)
endif()
list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
set(CMD_IDX 0)
if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
@ -710,12 +722,12 @@ function(cutlass_add_executable_tests NAME TARGET)
endforeach()
endif()
foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS})
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)
if (CMD_COUNT GREATER 1)
set(TEST_NAME ${NAME}_${CMD_IDX})
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME)
else()
set(TEST_NAME ${NAME})
string(TOLOWER "${NAME}" TEST_NAME)
endif()
# The following rigmarole is needed to deal with spaces and possible quotes in
@ -724,14 +736,14 @@ function(cutlass_add_executable_tests NAME TARGET)
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.
set(CMD_OPTIONS ${${CMD_OPTIONS}})
list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(CMD_OPTIONS)
set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}})
list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(TEST_COMMAND_OPTIONS)
add_custom_target(
${TEST_NAME}
COMMAND
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
DEPENDS
${TARGET}
)
@ -746,7 +758,7 @@ function(cutlass_add_executable_tests NAME TARGET)
add_test(
NAME c${TEST_NAME}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
)
set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})
@ -756,18 +768,21 @@ function(cutlass_add_executable_tests NAME TARGET)
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.
set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NAME})
file(MAKE_DIRECTORY ${TEST_GEN_DIR})
set(TEST_NAME c${TEST_NAME})
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
file(GENERATE
OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
)
install(
FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/
)
@ -775,8 +790,6 @@ function(cutlass_add_executable_tests NAME TARGET)
endif()
math(EXPR CMD_IDX "${CMD_IDX} + 1")
endforeach()
endfunction()
@ -787,6 +800,7 @@ if (CUTLASS_ENABLE_TOOLS)
add_dependencies(test_all test_profiler)
endif()
endif()
if (CUTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)

View File

@ -31,5 +31,6 @@
cutlass_example_add_executable(
08_turing_tensorop_gemm
turing_tensorop_gemm.cu
DISABLE_TESTS ON
)

View File

@ -31,5 +31,6 @@
cutlass_example_add_executable(
12_gemm_bias_relu
gemm_bias_relu.cu
DISABLE_TESTS ON
)

View File

@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes
the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.

View File

@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes
the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.

View File

@ -495,7 +495,7 @@ int main(int argc, const char **argv)
options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel
options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel
// Fill matrix A on host with uniform-random data [4, -4]
// Fill matrix A on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_a.host_view(),
1,
@ -503,7 +503,7 @@ int main(int argc, const char **argv)
ElementA(-2),
0);
// Fill matrix B on host with uniform-random data [4, -4]
// Fill matrix B on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_b.host_view(),
1,
@ -511,7 +511,7 @@ int main(int argc, const char **argv)
ElementB(-2),
0);
// Fill matrix C on host with uniform-random data [4, -4]
// Fill matrix C on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_c.host_view(),
1,

View File

@ -84,9 +84,10 @@
therefore letting the builder pick the collective specialization.
CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the
assembled collctives have the best performance, but this is not a guarantee. A user relying on `Auto`
assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto`
may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized
implementations that the builder can transparently assemble for `Auto`.
implementations that the builder can transparently assemble for `Auto`. But a user should not rely on
`Auto` if they require a specific scheduling policy and/or stage count to be used.
If a user decides to let the builders pick the collective specialization via `Auto` schedules,
they must be used for both mainloop and epilogue alike to ensure compatibility between the
@ -99,11 +100,6 @@
in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are
simply meant to be a convenience interface.
Note also that, while the selections made by CollectiveBuilder attempt to maximize performance, this is not
a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject
to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or
stage count to be used.
Details of this example
-----------------------
This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.

View File

@ -227,4 +227,17 @@ elect_one_leader_sync()
#endif
}
// Store value to remote shared memory in the cluster
CUTE_DEVICE
void
store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank)
{
#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank);
uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank);
asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];"
: : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr));
#endif
}
} // end namespace cute

View File

@ -35,6 +35,7 @@
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/arch/cache_operation.h"
namespace cutlass {
namespace arch {
@ -45,7 +46,9 @@ template <
/// Fragment type to store loaded data
typename AccessType,
/// The bytes of loading
int LoadBytes
int LoadBytes,
/// Cache operation
CacheOperation::Kind cache_op = CacheOperation::Always
>
struct global_load;
@ -59,7 +62,7 @@ struct global_load;
#if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \
(__CUDACC_VER_MAJOR__ > 11)) && \
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
#define CUTLASS_ENABLE_L2_PREFETCH 1
#else
#define CUTLASS_ENABLE_L2_PREFETCH 0
@ -71,7 +74,8 @@ struct global_load;
// keep the initializing code before ld.global
template <typename AccessType>
struct global_load<AccessType,
32
32,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -107,7 +111,40 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
16
32,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 *data = reinterpret_cast<uint4 *>(&D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %9, 0;\n"
" mov.b32 %0, %10;\n"
" mov.b32 %1, %11;\n"
" mov.b32 %2, %12;\n"
" mov.b32 %3, %13;\n"
" mov.b32 %4, %14;\n"
" mov.b32 %5, %15;\n"
" mov.b32 %6, %16;\n"
" mov.b32 %7, %17;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n"
" @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n"
"}\n"
: "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w),
"=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w)
: "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y),
"r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y),
"r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16));
}
};
template <typename AccessType>
struct global_load<AccessType,
16,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -133,7 +170,31 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
8
16,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint4 &data = reinterpret_cast<uint4 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %7;\n"
" mov.b32 %2, %8;\n"
" mov.b32 %3, %9;\n"
" @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w));
}
};
template <typename AccessType>
struct global_load<AccessType,
8,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -158,7 +219,30 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
4
8,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint2 &data = reinterpret_cast<uint2 &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
" mov.b32 %0, %4;\n"
" mov.b32 %1, %5;\n"
" @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n"
"}\n"
: "=r"(data.x), "=r"(data.y)
: "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y));
}
};
template <typename AccessType>
struct global_load<AccessType,
4,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -182,7 +266,29 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
2
4,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
unsigned &data = reinterpret_cast<unsigned &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" mov.b32 %0, %3;\n"
" @p ld.global.lu.u32 %0, [%1];\n"
"}\n"
: "=r"(data)
: "l"(ptr), "r"((int)pred_guard), "r"(data));
}
};
template <typename AccessType>
struct global_load<AccessType,
2,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
@ -206,7 +312,29 @@ struct global_load<AccessType,
template <typename AccessType>
struct global_load<AccessType,
1
2,
CacheOperation::LastUse
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {
uint16_t &data = reinterpret_cast<uint16_t &>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" mov.b16 %0, %3;\n"
" @p ld.global.lu.u16 %0, [%1];\n"
"}\n"
: "=h"(data)
: "l"(ptr), "r"((int)pred_guard), "h"(data));
}
};
template <typename AccessType>
struct global_load<AccessType,
1,
CacheOperation::Always
> {
CUTLASS_DEVICE
global_load(AccessType &D, void const *ptr, bool pred_guard) {

View File

@ -81,7 +81,6 @@ protected:
CUTLASS_DEVICE
static void red_release(int *ptr, int val)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
#if (__CUDA_ARCH__ >= 700)
/// SM70 and newer use memory consistency qualifiers
@ -94,7 +93,6 @@ protected:
__threadfence();
atomicAdd(ptr, val);
#endif // (__CUDA_ARCH__ >= 700)
#endif
}
@ -104,7 +102,6 @@ public:
CUTLASS_DEVICE
static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -115,14 +112,12 @@ public:
}
__syncthreads();
#endif
}
/// Uses thread[0] to wait for at least the specified count of signals on the given flag counter
CUTLASS_DEVICE
static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -132,13 +127,11 @@ public:
while(ld_acquire(flag_ptr) != val) {}
}
__syncthreads();
#endif
}
/// Uses thread[0] to wait for the specified count of signals on the given flag counter
CUTLASS_DEVICE
static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
T *flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
if (thread_idx == 0)
@ -149,14 +142,12 @@ public:
}
__syncthreads();
#endif
}
/// Increment the arrival count for a flag
CUTLASS_DEVICE
static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
__syncthreads();
@ -165,7 +156,6 @@ public:
{
red_release(flag_ptr, 1);
}
#endif
}
@ -173,7 +163,6 @@ public:
CUTLASS_DEVICE
static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1)
{
#if !defined(CUTLASS_PYTHON_HOST_CC)
int flag_idx = first_flag_idx + thread_idx;
T* flag_ptr = reinterpret_cast<T*>(lock_ptr) + flag_idx;
@ -184,7 +173,6 @@ public:
if (thread_idx < count) {
red_release(flag_ptr, 1);
}
#endif
}
};

View File

@ -59,9 +59,7 @@ inline std::ostream &operator<<(std::ostream &out, dim3 d) {
/// Output operator for CUDA built-in error type
inline std::ostream &operator<<(std::ostream &out, cudaError_t error) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
return out << cudaGetErrorString(error);
#endif
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -290,7 +290,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_same_v<Schedule, TmaWarpSpecialized> ||
cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> >> {
cute::is_same_v<Schedule, TmaWarpSpecializedCooperative> >> {
public:
// Passing void C disables source load
using ElementC = cute::conditional_t<cute::is_void_v<ElementC_>,
@ -302,16 +302,33 @@ public:
using ThreadOp = thread::LinearCombination<
ElementD, AlignmentD, ElementAccumulator, ElementCompute,
thread::ScaleType::Default, FloatRoundStyle::round_to_nearest, ElementC>;
ScaleType, FloatRoundStyle::round_to_nearest, ElementC>;
private:
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>;
using GmemStrideTypeC = gemm::TagToStrideC_t<GmemLayoutTagC>;
using GmemStrideTypeD = gemm::TagToStrideC_t<GmemLayoutTagD>;
public:
using CollectiveOp = typename Impl::CollectiveOp;
using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override<
ElementD, EpilogueTileType, Schedule>());
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
static constexpr bool DisableReuseSmemC = true;
using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue<
cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC,StagesD,DisableReuseSmemC>,
TileShape_MNK,
EpilogueTile_MN,
ElementC_, // need to pass void to expose via GemmUniversal
GmemStrideTypeC,
ElementD,
GmemStrideTypeD,
ThreadOp,
SM90_TMA_LOAD,
decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeC, ElementC, TileShape_MNK>()),
decltype(detail::sm90_get_smem_load_op_for_source<GmemStrideTypeC, ElementC>()),
SM90_TMA_STORE,
decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom<GmemStrideTypeD, ElementD, EpilogueTile_MN>()),
decltype(detail::sm90_get_smem_store_op_for_accumulator<GmemStrideTypeD, ElementD>())
>;
};
// Auto builder
@ -409,7 +426,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedElementwiseBase, Schedule> ||
cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
cute::is_base_of_v<TmaWarpSpecializedCooperativeElementwiseBase, Schedule> >> {
public:
using ThreadOp = thread::LinearCombinationGeneric<
@ -419,10 +436,13 @@ public:
Schedule::Round>;
private:
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
static constexpr bool DisableReuseSmemC = true;
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>;
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<StagesC,StagesD,DisableReuseSmemC>>;
public:
using CollectiveOp = typename Impl::CollectiveOp;
@ -459,7 +479,7 @@ struct CollectiveBuilder<
AlignmentD,
Schedule,
cute::enable_if_t<cute::is_base_of_v<TmaWarpSpecializedBiasElementwiseBase, Schedule> ||
cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
cute::is_base_of_v<TmaWarpSpecializedCooperativeBiasElementwiseBase, Schedule> >> {
public:
using ThreadOp = thread::LinearCombinationBiasElementwise<
@ -468,10 +488,12 @@ public:
Schedule::StoreT, typename Schedule::ElementBias>;
private:
static constexpr int StagesC = 1;
static constexpr int StagesD = 2;
using Impl = detail::TmaBuilderImpl<
TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute,
ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD,
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<1,2>>;
Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<StagesC,StagesD>>;
public:
using CollectiveOp = typename Impl::CollectiveOp;

View File

@ -82,7 +82,6 @@ public:
//
// Type Aliases
//
// derived types of output thread level operator
using DispatchPolicy = Sm90TmaWarpSpecialized<StagesC_,StagesD_,DisableSmemReuseC_>;
using BlockTileShape = BlockTileShape_;
using EpilogueTile = EpilogueTile_;
@ -108,7 +107,6 @@ public:
constexpr static bool iskThreadEpilogueOpWithBias = detail::IsThreadEpilogueOpWithBias<ThreadEpilogueOp>::value;
using AlignmentType = typename uint_bit<sizeof_bits<ElementOutput>::value * kOutputAlignment>::type;
static_assert(sizeof(ElementC) == 2, "Only 16b source supported for now");
static_assert(sizeof(ElementD) == 2, "Only 16b output supported for now");
static_assert(!is_layout<EpilogueTile>::value && is_tuple<EpilogueTile>::value, "EpilogueTile must be a cute::Tile or cute::Shape");
static_assert(rank(BlockTileShape{}) == 3, "BlockTileShape must be rank-3: [BLK_M,BLK_N,BLK_K]");
@ -117,17 +115,19 @@ public:
static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]");
private:
using InternalElementC = std::conditional_t<std::is_void_v<ElementC>,ElementD,ElementC>; // prevents void ref breakages
constexpr static int StagesC = StagesC_;
constexpr static int StagesD = StagesD_;
constexpr static bool is_source_supported = ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default ||
ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::NoBetaScaling;
static_assert((std::is_void_v<ElementC> && not is_source_supported) || (not std::is_void_v<ElementC> && is_source_supported));
// internal optimization to reuse C shared memory for storing D
using SmemLayoutAtomBitsC = decltype(downcast<sizeof_bits<ElementC>::value>(SmemLayoutAtomC{}));
using SmemLayoutAtomBitsC = decltype(downcast<sizeof_bits<InternalElementC>::value>(SmemLayoutAtomC{}));
using SmemLayoutAtomBitsD = decltype(downcast<sizeof_bits<ElementD>::value>(SmemLayoutAtomD{}));
constexpr static bool ReuseSmemC = not DispatchPolicy::DisableSmemReuseC &&
is_source_supported &&
sizeof(ElementC) == sizeof(ElementD) &&
sizeof(InternalElementC) == sizeof(ElementD) &&
StrideC{} == StrideD{} &&
cute::is_same_v<SmemLayoutAtomBitsC,SmemLayoutAtomBitsD>;
@ -152,7 +152,7 @@ public:
using LoadPipeline = cutlass::PipelineTransactionAsync<is_source_supported ? StagesC : 0>;
using LoadPipelineState = cutlass::PipelineState<is_source_supported ? StagesC : 0>;
constexpr static uint32_t TmaTransactionBytes =
size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof(ElementC));
size(take<0,2>(SmemLayoutC{})) * static_cast<uint32_t>(sizeof(InternalElementC));
// TMA pipeline for storing D
using StorePipeline = cutlass::PipelineTmaStore<ReuseSmemC ? StagesC : StagesD>;
@ -161,8 +161,8 @@ public:
struct SharedStorage {
struct TensorStorage : aligned_struct<128> {
cute::conditional_t<not is_source_supported,
detail::EmptyStorage<ElementC>,
array_aligned<ElementC, size(SmemLayoutC{})>> smem_C;
detail::EmptyStorage<InternalElementC>,
array_aligned<InternalElementC, size(SmemLayoutC{})>> smem_C;
alignas(128) cute::conditional_t<ReuseSmemC,
detail::EmptyStorage<ElementD>,
array_aligned<ElementD, size(SmemLayoutD{})>> smem_D;
@ -187,7 +187,7 @@ public:
struct Params {
using TMA_C = decltype(make_tma_copy(
CopyOpG2S{},
make_tensor(static_cast<ElementC const*>(nullptr),
make_tensor(static_cast<InternalElementC const*>(nullptr),
repeat_like(StrideC{}, int32_t(0)), StrideC{}),
SmemLayoutC{}(_,_,0)));
using TMA_D = decltype(make_tma_copy(
@ -217,7 +217,7 @@ public:
auto M = get<0>(problem_shape_MNKL);
auto N = get<1>(problem_shape_MNKL);
auto L = get<3>(problem_shape_MNKL);
Tensor tensor_c = make_tensor(args.ptr_C, make_layout(make_shape(M,N,L), args.dC));
Tensor tensor_c = make_tensor(static_cast<InternalElementC const*>(args.ptr_C), make_layout(make_shape(M,N,L), args.dC));
Tensor tensor_d = make_tensor(args.ptr_D, make_layout(make_shape(M,N,L), args.dD));
typename Params::TMA_C tma_load_c = make_tma_copy(
CopyOpG2S{},
@ -409,7 +409,7 @@ public:
// Allocate register tensors
auto tRS_rD_shape = take<0,3>(shape(thread_r2s.partition_S(bEsD))); // (R2S,R2S_M,R2S_N)
Tensor tRS_rC = make_tensor<ElementC>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
Tensor tRS_rC = make_tensor<InternalElementC>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
Tensor tRS_rD = make_tensor<ElementD>(tRS_rD_shape); // (R2S,R2S_M,R2S_N)
// Vectorized fragment view for thread epilogue op
@ -418,7 +418,7 @@ public:
Tensor tRS_rD_frg = recast<typename ThreadEpilogueOp::FragmentOutput>(tRS_rD);
// Partition for smem to register copy (tSR_)
TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R,ElementC>{}, tiled_r2s);
TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom<CopyOpS2R,InternalElementC>{}, tiled_r2s);
ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx);
Tensor tSR_sC = thread_s2r.partition_S(bEsC); // (S2R,S2R_M,S2R_N,EPI_M,EPI_N)
Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N)

View File

@ -130,6 +130,7 @@ public:
using ActivationFunctor = ActivationFunctor_<ElementCompute>;
static constexpr int kCount = 1;
static constexpr ScaleType::Kind kScale = Scale;
using FragmentOutput = Array<ElementOutput, kCount>;
using FragmentAccumulator = Array<ElementAccumulator, kCount>;

View File

@ -323,7 +323,7 @@ public:
OutputTileIterator destination_iterator, ///< Tile iterator for destination
OutputTileIterator source_iterator) ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles)
{
// Redcuce peer accumulator fragments into one fragment
// Reduce peer accumulator fragments into one fragment
AccumulatorFragment accum_fragment;
BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace);

View File

@ -190,7 +190,8 @@ struct CollectiveMma<
"SmemLayoutB K must be 128bytes to be transposed.");
static_assert(!transform::collective::detail::use_universal_transposition<InternalSmemLayoutAtomB, InternalElementB>(),
"Warp specialized ARF kernels have not supported universal B transposition yet.");
static_assert(!TransposeB || shape<0>(TileShape{}) == 64, "Optimized transpose RS kernel requires TileShape M = 64.");
static_assert(!TransposeB || !cute::is_same_v<KernelSchedule, KernelTmaWarpSpecializedCooperative>,
"Transpose RS kernel requires kernel schedule schmem is not KernelTmaWarpSpecializedCooperative.");
struct SharedStorage
{
@ -294,7 +295,7 @@ struct CollectiveMma<
static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages;
static_assert(K_PIPE_MMAS >= 1, "At least one MMA stage should be asynchronous for this mainloop.");
static_assert(K_PIPE_MMAS == 0, "no MMA stage should be asynchronous for this mainloop for now.");
static constexpr uint32_t TmaTransactionBytes =
(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof(InternalElementA)))+
(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof(InternalElementB)));
@ -368,21 +369,6 @@ struct CollectiveMma<
}
}
// Issue the prologue loads
int k_tile_prologue = min(k_tile_count, K_PIPE_MAX);
CUTLASS_PRAGMA_UNROLL
for (int count = 0; count < k_tile_prologue; ++count) {
pipeline.producer_acquire(smem_pipe_write);
using BarrierType = typename MainloopPipeline::ProducerBarrierType;
BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
int write_stage = smem_pipe_write.index();
copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
++k_tile_iter;
++smem_pipe_write;
}
k_tile_count -= k_tile_prologue;
// Mainloop
CUTLASS_PRAGMA_NO_UNROLL
for ( ; k_tile_count > 0; --k_tile_count) {

View File

@ -303,22 +303,6 @@ struct CollectiveMma<
}
}
// Issue the prologue loads
int k_tile_prologue = min(k_tile_count, K_PIPE_MAX);
CUTLASS_PRAGMA_UNROLL
for (int count = 0; count < k_tile_prologue; ++count) {
pipeline.producer_acquire(smem_pipe_write);
using BarrierType = typename MainloopPipeline::ProducerBarrierType;
BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
int write_stage = smem_pipe_write.index();
copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
++k_tile_iter;
++smem_pipe_write;
}
k_tile_count -= k_tile_prologue;
// Mainloop
CUTLASS_PRAGMA_NO_UNROLL
for ( ; k_tile_count > 0; --k_tile_count)

View File

@ -301,18 +301,19 @@ public:
return 0;
}
result = cudaGetDeviceProperties(&properties, device_idx);
int multiprocessor_count;
result = cudaDeviceGetAttribute(&multiprocessor_count,
cudaDevAttrMultiProcessorCount, device_idx);
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(" cudaGetDeviceProperties() returned error "
<< cudaGetErrorString(result));
CUTLASS_TRACE_HOST(
" cudaDeviceGetAttribute() returned error "
<< cudaGetErrorString(result));
return 0;
}
bool override_sm_count = (available_sm_count < 0 || available_sm_count > properties.multiProcessorCount);
bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count);
if (override_sm_count) {
available_sm_count = properties.multiProcessorCount;
available_sm_count = multiprocessor_count;
}
int max_active_blocks = maximum_active_blocks();
@ -440,8 +441,6 @@ public:
cudaError_t result = cudaGetLastError();
if (result != cudaSuccess) {
// Call cudaGetLastError() to clear the error bit
result = cudaGetLastError();
CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result));
return Status::kErrorInternal;
}

View File

@ -490,9 +490,9 @@ struct DefaultGemmConfiguration<arch::OpClassTensorOp, arch::Sm80, double,
static int const kAlignmentA = 1;
static int const kAlignmentB = 1;
using ThreadblockShape = GemmShape<128, 256, 64>;
using WarpShape = GemmShape<64, 64, 64>;
using InstructionShape = GemmShape<16, 8, 16>;
using ThreadblockShape = GemmShape<128, 128, 16>;
using WarpShape = GemmShape<32, 64, 16>;
using InstructionShape = GemmShape<8, 8, 4>;
static int const kStages = 3;
using EpilogueOutputOp = epilogue::thread::LinearCombination<

View File

@ -31,7 +31,7 @@
/*! \file
\brief Template for a GEMM kernel that can broadcast bias vector in the
epigloue.
epilogue.
*/
#pragma once

View File

@ -0,0 +1,167 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/arch/arch.h"
#include "cutlass/device_kernel.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/gemm/kernel/gemm_universal.h"
#include "cutlass/gemm/kernel/default_gemm_universal.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/gemm/device/gemm_universal_base.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace device {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename GemvKernel_>
class GemvStridedBatched {
public:

  using GemvKernel = GemvKernel_;

  using ElementA = typename GemvKernel::ElementA;
  using LayoutA = typename GemvKernel::LayoutA;
  using ElementB = typename GemvKernel::ElementB;
  using ElementC = typename GemvKernel::ElementC;

  using ElementAccumulator = typename GemvKernel::ElementAccumulator;
  using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp;

  static ComplexTransform const kTransformA = GemvKernel::kTransformA;
  static ComplexTransform const kTransformB = GemvKernel::kTransformB;

  // Thread-block shape mirrored from the kernel: x covers K, y covers M.
  static int const kThreadCount = GemvKernel::kThreadCount;
  static int const mThreadCount = GemvKernel::mThreadCount;

  static int const kStages = GemvKernel::kStages;

  static int const kAlignmentA = GemvKernel::kAlignmentA;
  static int const kAlignmentB = GemvKernel::kAlignmentB;
  static int const kAlignmentC = GemvKernel::kAlignmentC;

  using Arguments = typename GemvKernel::Arguments;
  using Params = typename GemvKernel::Params;

private:

  /// Host-side copy of the kernel parameters passed at launch
  Params params_;

public:

  /// Constructs the Gemv.
  GemvStridedBatched() {}

  /// Determines whether the Gemv can execute the given problem.
  static Status can_implement(Arguments const& args) {
    return GemvKernel::can_implement(args);
  }

  /// Gets the workspace size (this kernel requires no device workspace)
  static size_t get_workspace_size(Arguments const& args) { return 0; }

  /// Initializes Gemv state from arguments.
  Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
    params_ = Params(args);
    // K must be a multiple of the vectorized access width used by the kernel.
    if (args.problem_size.column() % GemvKernel::kElementsPerAccess) {
      return Status::kErrorMisalignedOperand;
    }
    return Status::kSuccess;
  }

  /// Lightweight update given a subset of arguments
  Status update(Arguments const &args, void *workspace = nullptr) {
    return params_.update(args);
  }

  /// Runs the kernel using initialized state.
  Status run(cudaStream_t stream = nullptr) {

    // gridDim.z is hardware-limited to 65535, and the kernel strides over the
    // batch dimension by gridDim.z. Clamp rather than wrap: the previous
    // `batch_count % 65536` launched a zero-sized grid whenever batch_count
    // was an exact multiple of 65536.
    int grid_z = params_.batch_count < 65535 ? params_.batch_count : 65535;
    dim3 grid(1, 1, grid_z);
    dim3 block(kThreadCount, mThreadCount, 1);

    int smem_size = 0;

    // Launch
    cutlass::Kernel<GemvKernel><<<grid, block, smem_size, stream>>>(params_);

    //
    // Query for errors
    //
    cudaError_t result = cudaGetLastError();

    return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
  }

  /// Runs the kernel using initialized state.
  Status operator()(cudaStream_t stream = nullptr) { return run(stream); }

  /// Initializes and runs in one call.
  Status operator()(
      Arguments const &args,
      void *workspace = nullptr,
      cudaStream_t stream = nullptr) {

    Status status = initialize(args, workspace, stream);

    if (status == Status::kSuccess) {
      status = run(stream);
    }

    return status;
  }
};
////////////////////////////////////////////////////////////////////////////////
} // namespace device
} // namespace gemm
} // namespace cutlass
////////////////////////////////////////////////////////////////////////////////

View File

@ -150,7 +150,7 @@ template<
int Stages_,
class ClusterShape_ = Shape<_1,_1,_1>,
class KernelSchedule = KernelTmaWarpSpecialized,
int PipelineAsyncMmaStages_ = 1
int PipelineAsyncMmaStages_ = 0
>
struct MainloopSm90TmaGmmaRmemAWarpSpecialized {
constexpr static int Stages = Stages_;

View File

@ -0,0 +1,368 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief
*/
#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/complex.h"
#include "cutlass/tensor_ref.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/cache_operation.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/numeric_conversion.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace gemm {
namespace kernel {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
    typename ElementA_,           /// matrix
    typename LayoutA_,
    typename ElementB_,           /// vector
    typename ElementC_,
    typename ElementAccumulator_,
    int kElementsPerAccess_,
    typename EpilogueOutputOp_
>
struct GemvStridedBatched {
public:

  using ElementA = ElementA_;
  using LayoutA = layout::RowMajor;
  using TensorRefA = TensorRef<ElementA, LayoutA>;

  static_assert(std::is_same<LayoutA, LayoutA_>::value,
                "Only supported for row-major A matrix");

  using ElementB = ElementB_;
  using ElementC = ElementC_;

  using ElementAccumulator = ElementAccumulator_;
  using EpilogueOutputOp = EpilogueOutputOp_;

  static ComplexTransform const kTransformA = ComplexTransform::kNone;
  static ComplexTransform const kTransformB = ComplexTransform::kNone;

  static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest;

  // number of return elements in a global access
  static int const kElementsPerAccess = kElementsPerAccess_;

  using FragmentA = Array<ElementA, kElementsPerAccess>;
  using FragmentB = Array<ElementB, kElementsPerAccess>;
  using FragmentCompute = Array<ElementAccumulator, kElementsPerAccess>;

  // thread block shape (kThreadCount, mThreadCount): x covers K, y covers M
  static int const kThreadCount = std::min(static_cast<int>(128 / (kElementsPerAccess * sizeof(ElementA))), 16);
  static int const mThreadCount = 128 / kThreadCount;

  // rolling tile shape
  static int const kTileA = kThreadCount * kElementsPerAccess;
  static int const mTileA = mThreadCount * 8;

  //
  // Structures
  //

  /// Argument structure
  struct Arguments
  {
    MatrixCoord problem_size;                     // (row = m, column = k)
    int32_t batch_count;
    typename EpilogueOutputOp::Params output_op;

    TensorRefA ref_A;

    ElementB const *ptr_B;
    ElementC const *ptr_C;
    ElementC *ptr_D;

    int64_t batch_stride_A;
    int64_t batch_stride_B;
    int64_t batch_stride_C;
    int64_t batch_stride_D;

    //
    // Methods
    //

    Arguments() : batch_count(0) {}

    Arguments(
        MatrixCoord problem_size,
        int32_t batch_count,
        typename EpilogueOutputOp::Params output_op,
        TensorRefA ref_A,
        void const *ptr_B,
        void const *ptr_C,
        void *ptr_D,
        int64_t batch_stride_A,
        int64_t batch_stride_B,
        int64_t batch_stride_C,
        int64_t batch_stride_D) : problem_size(problem_size),
                                  batch_count(batch_count),
                                  output_op(output_op),
                                  ref_A(ref_A),
                                  ptr_B(static_cast<ElementB const *>(ptr_B)),
                                  ptr_C(static_cast<ElementC const *>(ptr_C)),
                                  ptr_D(static_cast<ElementC *>(ptr_D)),
                                  batch_stride_A(batch_stride_A),
                                  batch_stride_B(batch_stride_B),
                                  batch_stride_C(batch_stride_C),
                                  batch_stride_D(batch_stride_D)
    {
    }

    /// Convenience constructor for the non-batched case (batch_count == 1,
    /// unit batch strides). The previous version passed a spurious extra `1`
    /// to the 11-parameter constructor above, which could not compile.
    Arguments(
        MatrixCoord problem_size,
        typename EpilogueOutputOp::Params output_op,
        TensorRefA ref_A,
        void const *ptr_B,
        void const *ptr_C,
        void *ptr_D) : Arguments(problem_size,
                                 1,
                                 output_op,
                                 ref_A,
                                 ptr_B,
                                 ptr_C,
                                 ptr_D,
                                 1,
                                 1,
                                 1,
                                 1)
    {
    }

    /// Copies all fields from another Arguments instance.
    Status update(Arguments const &args)
    {
      problem_size = args.problem_size;
      batch_count = args.batch_count;
      output_op = args.output_op;
      // Fixed: was the self-assignment `ref_A = ref_A;`, which silently
      // dropped the caller's new A-matrix reference.
      ref_A = args.ref_A;
      ptr_B = args.ptr_B;
      ptr_C = args.ptr_C;
      ptr_D = args.ptr_D;
      batch_stride_A = args.batch_stride_A;
      batch_stride_B = args.batch_stride_B;
      batch_stride_C = args.batch_stride_C;
      batch_stride_D = args.batch_stride_D;
      return Status::kSuccess;
    }
  };

  using Params = Arguments;

  /// Shared memory storage structure (this kernel uses no shared memory)
  union SharedStorage
  {
  };

public:
  //
  // Methods
  //

  CUTLASS_DEVICE
  GemvStridedBatched() {}

  /// Determines whether kernel satisfies alignment
  static Status can_implement(cutlass::MatrixCoord const &problem_size)
  {
    // K must be a multiple of the vectorized global-access width.
    if (problem_size.column() % kElementsPerAccess != 0)
      return Status::kErrorMisalignedOperand;
    return Status::kSuccess;
  }

  static Status can_implement(Arguments const &args)
  {
    return can_implement(args.problem_size);
  }

  /// Executes one GEMV
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage)
  {
    // Loop over batch indices; the grid may be smaller than batch_count, in
    // which case each z-block strides over the remaining batches.
    for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z)
    {
      int k_col_id = threadIdx.x;
      int m_row_id = threadIdx.y;

      // problem_size (row = m, column = k)
      // matrix A (batch, m, k)
      // vector B (batch, 1, k)
      // vector C (batch, m, 1)
      // vector D (batch, m, 1)

      // move in the batch dimension
      ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A;
      ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B;
      ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C;
      ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D;

      // move in the k dimension
      ptr_A += k_col_id * kElementsPerAccess;
      ptr_B += k_col_id * kElementsPerAccess;

      // move in the m dimension
      ptr_A += m_row_id * params.problem_size.column();
      ptr_C += m_row_id;
      ptr_D += m_row_id;

      NumericArrayConverter<ElementAccumulator, ElementA, kElementsPerAccess, Round> srcA_converter;
      NumericArrayConverter<ElementAccumulator, ElementB, kElementsPerAccess, Round> srcB_converter;

      // Each iteration processes an mTileA-row slab of A; this thread owns
      // mTileA / mThreadCount of those rows, interleaved by mThreadCount.
      for (; m_row_id < params.problem_size.row(); m_row_id += mTileA)
      {
        // NOTE: only accum[0] is explicitly 0.f; the remainder are
        // value-initialized to zero by the aggregate initializer.
        ElementAccumulator accum[mTileA / mThreadCount] = {0.f};

        FragmentB fragB;
        FragmentA fragA[mTileA / mThreadCount];

        // number of rows this thread actually owns in the (possibly partial) slab
        int mElemCountPerTile = min(mTileA / mThreadCount, (params.problem_size.row() - m_row_id - 1) / mThreadCount + 1);

        // Vectorized main loop over full kTileA-wide chunks of K.
        int kUnroll = 0;
        for (; kUnroll < params.problem_size.column() / kTileA * kTileA; kUnroll += kTileA)
        {
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            // fetch from matrix A
            arch::global_load<FragmentA,
                              sizeof(FragmentA),
                              arch::CacheOperation::LastUse>(fragA[m], (ptr_A + kUnroll + m * mThreadCount * params.problem_size.column()), true);
          }

          // fetch from vector B
          arch::global_load<FragmentB,
                            sizeof(FragmentB),
                            arch::CacheOperation::Always>(fragB, (ptr_B + kUnroll), true);

          for (int m = 0; m < mElemCountPerTile; m++)
          {
            FragmentCompute fragB_Compute = srcB_converter(fragB);
            FragmentCompute fragA_Compute = srcA_converter(fragA[m]);

            // Math
            CUTLASS_PRAGMA_UNROLL
            for (int e = 0; e < kElementsPerAccess; e++)
            {
              accum[m] += fragA_Compute.at(e) * fragB_Compute.at(e);
            }
          }
        }

        // calculate the rest of K elements
        // each thread fetch 1 element each time
        for (int k = kUnroll + k_col_id; k < params.problem_size.column(); k += kThreadCount)
        {
          // `- k_col_id * kElementsPerAccess` undoes the vectorized base
          // offset applied to ptr_A/ptr_B above.
          ElementB b = *(ptr_B - k_col_id * kElementsPerAccess + k);
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            ElementA a = *(ptr_A - k_col_id * kElementsPerAccess + k + m * mThreadCount * params.problem_size.column());
            accum[m] += ElementAccumulator(a) * ElementAccumulator(b);
          }
        }

        EpilogueOutputOp output_op(params.output_op);

        typename EpilogueOutputOp::FragmentOutput source_fragment[mTileA / mThreadCount];

        // prefetch from source matrix C
        if (output_op.is_source_needed())
        {
          for (int m = 0; m < mElemCountPerTile; m++)
          {
            source_fragment[m][0] = *(ptr_C + m * mThreadCount);
          }
        }

        typename EpilogueOutputOp::FragmentAccumulator accum_fragment;
        typename EpilogueOutputOp::FragmentOutput output_fragment;

        for (int m = 0; m < mElemCountPerTile; m++)
        {
          // Butterfly reduction across the kThreadCount lanes that share
          // this row; kThreadCount <= 16, so the exchange stays inside a warp.
          for (int mask = (kThreadCount >> 1); mask > 0; mask >>= 1)
          {
            accum[m] += __shfl_xor_sync(0xFFFFFFFF, accum[m], mask, 32);
          }

          // Lane 0 of each row applies the epilogue and writes the result.
          if (k_col_id == 0)
          {
            accum_fragment[0] = accum[m];
            if (output_op.is_source_needed())
            {
              output_fragment = output_op(accum_fragment, source_fragment[m]);
            }
            else
            {
              output_fragment = output_op(accum_fragment);
            }
            *(ptr_D + m * mThreadCount) = output_fragment[0];
          }
        }

        // advance to the next mTileA-row slab
        ptr_A += mTileA * params.problem_size.column();
        ptr_C += mTileA;
        ptr_D += mTileA;
      }
    }
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace kernel
} // namespace gemm
} // namespace cutlass
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -166,8 +166,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -182,7 +182,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -173,8 +173,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -189,7 +189,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -196,8 +196,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -212,7 +212,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -204,8 +204,8 @@ public:
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n");
return implementable;
}
static constexpr int tma_alignment_bits = 128;
static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
constexpr int tma_alignment_bits = 128;
constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
auto M = get<0>(args.problem_shape);
auto N = get<1>(args.problem_shape);
auto K = get<2>(args.problem_shape);
@ -220,7 +220,17 @@ public:
N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0));
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
return implementable;
}
constexpr bool is_beta_supported =
CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default;
implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr);
if (!implementable) {
CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n");
return implementable;
}
return implementable;
}

View File

@ -163,6 +163,12 @@ public:
int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc;
int const max_blk_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % size(cluster_shape));
int blk_per_device = min_num_gpc * max_blk_occupancy_per_gpc;
// The calculation below allows for larger grid size launch for different GPUs.
int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc;
int const max_blk_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % size(cluster_shape));
blk_per_device += max_blk_occupancy_per_residual_gpc;
blk_per_device = sm_count < blk_per_device ? sm_count : blk_per_device;
launch_grid.x = std::min(

View File

@ -630,9 +630,6 @@ struct ThreadblockSwizzleStreamK {
}
// Guards needed for PyCUTLASS library generation
#if !defined(CUTLASS_PYTHON_HOST_CC)
//
// Device-side interface
//
@ -692,7 +689,7 @@ struct ThreadblockSwizzleStreamK {
return GemmCoord(m, n, get_batch_idx());
}
/// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rastorization)
/// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rasterization)
CUTLASS_DEVICE
GemmCoord get_tile_offset_row_major(int tile_idx) const
{
@ -740,7 +737,7 @@ struct ThreadblockSwizzleStreamK {
div_mod_sk_iters_per_region(region_idx, iter_in_region, iter);
int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region; // number of iterations in the region's big blocks
int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal bocks
int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal blocks
int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region);
int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters);
@ -794,8 +791,6 @@ struct ThreadblockSwizzleStreamK {
return get_sk_block_idx(iter);
}
#endif // !defined(CUTLASS_PYTHON_HOST_CC)
};
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -450,7 +450,7 @@ private :
CUTLASS_DEVICE
void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) {
if (barrier_token == BarrierStatus::WaitAgain) {
consumer_wait(stage, phase);
full_barrier_ptr_[stage].wait(phase);
}
}
@ -654,7 +654,7 @@ public :
consumer_release(state.index());
}
protected:
private:
FullBarrier *full_barrier_ptr_ = nullptr;
EmptyBarrier *empty_barrier_ptr_ = nullptr;
Params params_;
@ -976,6 +976,11 @@ public:
++stage_;
}
CUTLASS_DEVICE
void advance() {
++stage_;
}
private:
CUTLASS_DEVICE

View File

@ -89,19 +89,16 @@ public:
/// Waits until the semaphore is equal to the given value
CUTLASS_DEVICE
void wait(int status = 0) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
while( __syncthreads_and(state != status) ) {
fetch();
}
__syncthreads();
#endif
}
/// Updates the lock with the given result
CUTLASS_DEVICE
void release(int status = 0) {
#if !defined(CUTLASS_PYTHON_HOST_CC)
__syncthreads();
if (wait_thread) {
@ -111,7 +108,6 @@ public:
asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status));
#endif
}
#endif
}
};

View File

@ -3,8 +3,8 @@
## Layout
This document describes `Layout`, CuTe's core abstraction.
A `Layout` maps from (a) logical coordinate space(s)
to a physical index space.
A `Layout` maps from a logical coordinate space
to an index space.
`Layout`s present a common interface to multidimensional array access
that abstracts away the details of how the array's elements are organized in memory.
@ -19,7 +19,11 @@ This can help users do things like partition layouts of data over layouts of thr
## Layouts and Tensors
Any of the `Layout`s discussed in this section can be composed with data -- a pointer or an array -- to create a `Tensor`. The responsibility of the `Layout` is to define valid coordinate space(s) and, therefore, the logical shape of the data and map those into an index space. The index space is precisely the offset that would be used to index into the array of data.
Any of the `Layout`s discussed in this section can be composed with data -- e.g., a pointer or an array -- to create a `Tensor`.
The `Layout`'s logical coordinate space represents the logical "shape" of the data,
e.g., the modes of the `Tensor` and their extents.
The `Layout` maps a logical coordinate into an index,
which is an offset to be used to index into the array of data.
For details on `Tensor`, please refer to the
[`Tensor` section of the tutorial](./03_tensor.md).
@ -31,31 +35,31 @@ Both `Shape` and `Stride` are `IntTuple` types.
### IntTuple
An `IntTuple` is an integer or a tuple of `IntTuple`s.
An `IntTuple` is defined recursively as either a single integer, or a tuple of `IntTuple`s.
This means that `IntTuple`s can be arbitrarily nested.
Operations defined on `IntTuple`s include the following.
* `get<I>(IntTuple)`: The `I`th element of the `IntTuple`. Note that `get<0>` is defined for integer `IntTuples`.
* `get<I>(IntTuple)`: The `I`th element of the `IntTuple`. For an `IntTuple` consisting of a single integer, `get<0>` is just that integer.
* `rank(IntTuple)`: The number of elements in an `IntTuple`. An int has rank 1, a tuple has rank `tuple_size`.
* `rank(IntTuple)`: The number of elements in an `IntTuple`. A single integer has rank 1, and a tuple has rank `tuple_size`.
* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc.
* `size(IntTuple)`: The product of all elements of the IntTuple.
* `size(IntTuple)`: The product of all elements of the `IntTuple`.
We write `IntTuple`s with parenthesis to denote the hierarchy. E.g. `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s.
We write `IntTuple`s with parenthesis to denote the hierarchy. For example, `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s.
## Layout
A `Layout` is then a pair of `IntTuple`s. The first defines the abstract *shape* of the layout and the second defines the *strides*, which map from coordinates within the shape to the index space.
A `Layout` is then a pair of `IntTuple`s. The first element defines the abstract *shape* of the `Layout`, and the second element defines the *strides*, which map from coordinates within the shape to the index space.
As a pair of `IntTuple`s, we can define many similar operations on `Layout`s including
Since a `Layout` is just a pair of `IntTuple`s, we can define operations on `Layout`s analogous to those defined on `IntTuple`.
* `get<I>(Layout)`: The `I`th sub-layout of the `Layout`.
* `rank(Layout)`: The number of modes in a `Layout`.
* `depth(Layout)`: The number of hierarchical `Layout`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc.
* `depth(Layout)`: The number of hierarchical `Layout`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc.
* `shape(Layout)`: The shape of the `Layout`.
@ -86,7 +90,7 @@ These hierarchical access functions include the following.
### Vector examples
Then, we can define a vector as any `Shape` and `Stride` pair with `rank == 1`.
We define a vector as any `Shape` and `Stride` pair with `rank == 1`.
For example, the `Layout`
```
@ -95,9 +99,9 @@ Stride: (1)
```
defines a contiguous 8-element vector.
Similarly, with a stride of `(2)`,
For a vector with the same Shape but a Stride of `(2)`,
the interpretation is that the eight elements
are stored at positions 0, 2, 4, $\dots$.
are stored at positions 0, 2, 4, $\dots$, 14.
By the above definition, we *also* interpret
@ -168,9 +172,17 @@ auto layout_2x4 = make_layout(make_shape (2, make_shape (2,2)),
make_stride(4, make_stride(2,1)));
```
The `make_layout` function returns a `Layout`.
It deduces the returned `Layout`'s template arguments from the function's arguments.
Similarly, the `make_shape` and `make_stride` functions
return a `Shape` and a `Stride`, respectively.
CuTe often uses these `make_*` functions,
because constructor template argument deduction (CTAD)
does not work for `cute::tuple` as it works for `std::tuple`.
## Using a `Layout`
The fundamental use of a `Layout` is to map between logical coordinate space(s) and index space. For example, to print an arbitrary rank-2 layout, we can write the function
The fundamental use of a `Layout` is to map between logical coordinate space(s) and an index space. For example, to print an arbitrary rank-2 layout, we can write the function
```c++
template <class Shape, class Stride>

View File

@ -73,6 +73,7 @@ In C++, we identify a Tuple with the
`cute::tuple` behaves like `std::tuple`, but it works on device or host,
and it imposes restrictions on its template arguments for performance and simplicity.
#### IntTuple
CuTe then defines an IntTuple as either an integer, or a Tuple of IntTuple.
@ -136,7 +137,7 @@ This code produces the following text output.
```
`print(layout(1, 1))` prints the mapping of
the logical 2-D coordinate (1,1) to 1-D index, which is 4.
the logical 2-D coordinate (1,1) to the 1-D index, which is 4.
You can see that from the table,
which shows the left logical index as the "row,"
and the right logical index as the "column."
@ -302,13 +303,13 @@ Both humans and CuTe compute composition using the following rules.
2. Concatenation: A layout can be expressed as the concatenation of its sublayouts. We denote concatenation with parentheses: $B = (B_0,B_1,...)$. The CuTe function `make_layout`, when given zero or more `Layout`s, concatenates them.
3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B0, B1, ...) = (A \circ B0, A \circ B1, ...)$.
3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B_0, B_1, ...) = (A \circ B_0, A \circ B_1, ...)$.
4. "Base case": For layouts $A = a : b$ and $B = c : d$ with integral shape and stride, $A \circ B = R = c : (b * d)$.
5. By-mode composition: Let $\langle B, C \rangle$ (angle brackets, not parentheses)
denote a tuple of two layouts B and C, not their concatenation. Let A = (A0, A1).
Then, $A \circ \langle B, C \rangle = (A0, A1) \circ \langle B, C \rangle = (A0 \circ B, A1 \circ C)$.
denote a tuple of two layouts B and C, not their concatenation. Let $A = (A_0, A_1)$.
Then, $A \circ \langle B, C \rangle = (A_0, A_1) \circ \langle B, C \rangle = (A_0 \circ B, A_1 \circ C)$.
This allows the application of composition independently to sublayouts of $A$.
#### Examples: Reshape a vector into a matrix
@ -359,6 +360,55 @@ The resulting layout has shape $(4,5)$, just as before. What are the strides?
5. Result: (4:10, 5:2), which by concatenation is (4,5) : (10,2).
#### Example: Reshape a matrix into another matrix
The composition $((20,2):(16,4) \circ (4,5):(1,4))$
expresses reshaping the matrix with layout (20,2):(16,4),
into a 4 x 5 matrix in a column-major way.
1. By deconcatenation, $(4,5) : (1,4)$ is $(4:1, 5:4)$.
2. Composition is distributive, so $(20,2):(16,4) \circ (4:1, 5:4)$ is $((20,2):(16,4) \circ 4:1, (20,2):(16,4) \circ 5:4)$.
3. $(20,2):(16,4) \circ 4:1$ has shape $4$ and stride $16$. (4:1 expresses picking the first 4 consecutive elements of (20,2):(16,4). These elements run down the 0th column (leftmost mode) of the layout, whose stride is 16.)
4. $(20,2):(16,4) \circ 5:4$ has shape $5$ and stride $64 = 4 \cdot 16$.
5. Result: $(4:16, 5:64)$, which by concatenation is $(4,5) : (16,64)$.
We get exactly this result with CuTe
if we use compile-time shapes and strides.
The following C++ code prints `(_4,_5):(_16,_64)`.
```c++
using namespace cute;
auto a = make_layout(make_shape(Int<20>{}, _2{}), make_stride(_16{}, _4{}));
auto b = make_layout(make_shape( _4{}, _5{}), make_stride( _1{}, _4{}));
auto c = composition(a, b);
printf("\n");
print(c);
```
Results may _look_ different (but are the same mathematically)
if we use run-time integers.
The following C++ code prints `((4,1),(5,1)):((16,4),(64,4))`.
```c++
using namespace cute;
auto a = make_layout(make_shape(20, 2), make_stride(16, 4));
auto b = make_layout(make_shape( 4, 5), make_stride( 1, 4));
auto c = composition(a, b);
printf("\n");
print(c);
```
((4,1),(5,1)):((16,4),(64,4)) is effectively the same layout
as (4,5) : (16,64), because the 1s in the shape don't affect the layout
(as a mathematical function from one integer to one integer).
CuTe chooses not to simplify layout computations
with run-time values in them as much as it could,
because simplifications involving run-time values have a run-time cost.
### Product
CuTe includes four different kinds of layout products.
@ -428,7 +478,7 @@ results in Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)).
| (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 |
Note how the tile appears in the leftmost column and is reproduced
in each column in the same order as the matrix-of-tiles. That is,
in each column in the same order as the matrix-of-tiles. That is,
the tile can be indexed through the first mode of the result and the
matrix-of-tiles can be indexed through the second mode.
@ -456,8 +506,8 @@ Shape ((3, 2), (4, 2)) and Stride ((16, 1), (4, 2)).
| (2,1) | 33 | 37 | 41 | 45 | 35 | 39 | 43 | 47 |
The tile is now interleaved or "raked" with the other 3x4 matrix-of-tiles
instead of appearing as blocks. Other references call this cyclic
distribution.
instead of appearing as blocks. Other references call this a "cyclic
distribution."
This might look familiar if you have ever used ScaLAPACK.
It expresses a 2-D block cyclic distribution of a 6 x 8 matrix
@ -542,7 +592,87 @@ CuTe includes 3 different kinds of layout division operations.
We will summarize these in the sections that follow.
#### Logical divide : the intuitive tiling
#### Logical divide
##### Example worked in detail
This section will work the following logical divide example in detail.
```c++
Layout a = make_layout(24, 2);
Layout b = make_layout( 4, 2);
Layout c = logical_divide(a, b);
```
Logical divide produces a rank-2 `Layout`,
where mode 0 (the leftmost mode) corresponds to the divisor `b`,
and mode 1 (the rightmost mode) corresponds to the "remainder."
Intuitively, dividing 24 elements into groups of 4 yields 6 groups,
so we know that mode 1 has 6 elements.
We just don't know its shape yet.
CuTe defines `logical_divide(a, b)` as
`composition(a, make_layout(b, complement(b, size(a))))`.
Here, `size(a)` is 24.
What is `complement(b, 24)`?
Intuitively, it means "the remainder,"
what's left over after applying `b` to 0, 1, 2, $\dots$, 23.
The layout 4:2 means "take 4 elements at even-numbered indices."
The following table overlays the range of 4:2
atop the complement's codomain 0, 1, $\dots$, 23.
| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | $\dots$ | 23 |
Layouts are linear, so their range must include zero.
The complement of 4:2 with respect to 24 is thus a layout whose range
* includes zero;
* does not include any other elements of the range of 4:2
(i.e., satisfies the disjoint property; see above); and
* includes as much of 0, 1, $\dots$, 23 as possible
(so that it forms the "remainder" of 4:2 with respect to 24).
Intuitively, the range of the complement must look like this:
0, 1, 8, 9, 16, 17.
The resulting layout is ordered.
It has size 6 and cosize 18,
so it satisfies the bounded property (see above).
This is the layout (2, 3) : (1, 8).
(Going from this intuitive sense of the complement
to knowing how to compute it directly
is out of scope for this part of the tutorial.)
The following table shows 4:2 with its complement (2, 3) : (1, 8).
| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | | | | | | | | |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | $\dots$ | 23 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Range of complement | 0 | 1 | | | | | | | 8 | 9 | | | | | | | 16 | 17 | | |
Now we know that `logical_divide`(24:2, 4:2) is
`composition`(24:2, `make_layout`(4:2, (2,3):(1,8))).
The composition of two layouts has the shape of the second (rightmost) layout,
so the resulting shape is (4, (2, 3)).
We see that the leftmost mode 4 corresponds to the divisor 4:2,
and the rightmost mode (2, 3) describes what's "left over"
from the original shape 24.
What are the strides?
We can start from the leftmost mode.
4:2 takes every other element (the even-numbered elements) of 24:2.
That is a stride-2 access pattern applied to a layout that itself has stride 2.
The resulting stride is 4.
Similarly, the stride 2 of 24:2
doubles the two strides of the rightmost mode.
The resulting layout is (4, (2, 3)) : (4, (2, 16)).
##### Tiling example
Suppose I have the 6 x 8 matrix from the Raked Product section
and want to "collect" the `tile`, turning the Raked Product into
@ -607,7 +737,7 @@ Note that this is the same layout as the result in the Logical Product section.
That is, the first mode is our original tile (and can be interpreted as a 2x2 matrix itself)
and the second mode is its logical layout within the raked layout.
##### More Examples of Divide
#### More Examples of Divide
For brevity, shapes can be used with `logical_divide` and `tiled_divide` to quickly split and tile modes of a tensor. For example, this C++ code

View File

@ -200,7 +200,7 @@ class ArtifactManager:
self.compiled_cache_host.insert(key, compiled_host_fns)
return True
def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
def emit_compile_(self, operation_list, compilation_options):
"""
Compile a list of kernels and store them into database
"""
@ -306,41 +306,17 @@ class ArtifactManager:
cubin_image = file.read()
# Set up the host-side library code
if requires_nvcc_hostlib_compilation:
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
else:
options = compilation_options.get()
cmd = (
"echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1"
% source_buffer_host
)
filtered_opts = [
"-default-device",
"-Xcicc",
"-Xllc",
"--expt-relaxed-constexpr",
"-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored",
]
for opt in options:
opt = opt.decode("utf-8")
if opt not in filtered_opts and "-arch=sm_" not in opt:
if "--include-path=" in opt:
cmd += " " + opt.replace(
"--include-path=",
"-I",
)
else:
cmd += " " + opt
cmd_template = (
"echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}"
% source_buffer_host
)
cmd = SubstituteTemplate(
cmd_template,
{
"cuda_install_path": CUDA_INSTALL_PATH,
"options": compilation_options.get_str(),
},
)
tempfile.tempdir = "./"
temp = tempfile.NamedTemporaryFile(
@ -375,7 +351,6 @@ class ArtifactManager:
# save the cubin
operation_key = []
operation_list = []
requires_nvcc_hostlib_compilation = False
for operation in operations:
# step 1: get kernel string as key
key = operation.rt_module.emit() + operation.procedural_name() + self.backend
@ -398,17 +373,9 @@ class ArtifactManager:
operation_list.append(operation.rt_module)
operation_key.append(key)
# Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
# the PyCUTLASS host-side library. Otherwise, g++ will be used.
if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x:
if self.backend == "nvrtc":
raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.")
requires_nvcc_hostlib_compilation = True
if len(operation_list) > 0:
cubin_image, host_lib, host_file = self.emit_compile_(
operation_list, compile_options, requires_nvcc_hostlib_compilation)
operation_list, compile_options)
err, module = cuda.cuModuleLoadData(cubin_image)
if err != cuda.CUresult.CUDA_SUCCESS:

View File

@ -43,10 +43,10 @@
template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
.def("__init__", [](cutlass::TensorRef<T, L>& tensor_ref, int64_t address, const L& layout_ ) {
.def(py::init([](int64_t address, const L& layout_ ) {
T* ptr = reinterpret_cast< T*>(address);
new (&tensor_ref) cutlass::TensorRef<T, L>(ptr, layout_);
})
return new cutlass::TensorRef<T, L>(ptr, layout_);
}))
.def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
T* ptr = tensor_ref.data();
return int64_t(ptr);

View File

@ -29,9 +29,12 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################
import copy
import os
from pybind11.setup_helpers import Pybind11Extension
import setuptools
from setuptools import setup
from setuptools.command.build_ext import build_ext
def _cutlass_path_from_dir() -> str:
@ -61,31 +64,57 @@ cutlass_path = (
else _cutlass_path_from_dir()
)
cuda_install_path = (
os.getenv('CUDA_INSTALL_PATH')
if os.getenv('CUDA_INSTALL_PATH') is not None
else _cuda_install_path_from_nvcc()
)
ext_modules = []
try:
from pybind11.setup_helpers import Pybind11Extension, build_ext
include_dirs = [
cutlass_path + '/include',
cuda_install_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/test',
]
class BuildExtension(build_ext):
"""
Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
ext_modules = [
Pybind11Extension('cutlass_bindings',
['cutlass/cpp/cutlass_bindings.cpp'],
include_dirs=include_dirs,
extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1'])
]
except ImportError:
pass
def build_extensions(self):
original_compile = self.compiler._compile
def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
"""
Wrapper around build_ext.compiler._compile method
"""
postargs = copy.deepcopy(extra_postargs)
postargs = [f for f in postargs if f not in ['-g0', '-fvisibility=hidden']]
postargs.extend(["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu'])
try:
original_compiler = self.compiler.compiler_so
self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc'])
original_compile(obj, src, ext, cc_args, postargs, pp_opts)
finally:
self.compiler.set_executable('compiler_so', original_compiler)
self.compiler._compile = custom_compile
super().build_extensions()
include_dirs = [
cutlass_path + '/include',
cuda_install_path + '/include',
cutlass_path + '/tools/util/include',
cutlass_path + '/test',
]
ext_modules = [
Pybind11Extension('cutlass_bindings',
['cutlass/cpp/cutlass_bindings.cpp'],
include_dirs=include_dirs,
extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'],
libraries=['cudart'])
]
setup(
@ -103,4 +132,7 @@ setup(
'treelib'
],
ext_modules=ext_modules,
cmdclass={
'build_ext': BuildExtension
}
)

View File

@ -41,6 +41,7 @@ add_custom_target(
cutlass_test_unit_gemm_device_tensorop_planar_complex
cutlass_test_unit_gemm_device_sparse_tensorop_sm80
cutlass_test_unit_gemv_device
cutlass_test_unit_gemv_device_strided_batched
cutlass_test_unit_gemm_device_tensorop_sm90
cutlass_test_unit_gemm_device_tensorop_cluster_multicast_sm90
)
@ -60,6 +61,7 @@ add_custom_target(
test_unit_gemm_device_tensorop_planar_complex
test_unit_gemm_device_sparse_tensorop_sm80
test_unit_gemv_device
test_unit_gemv_device_strided_batched
test_unit_gemm_device_tensorop_sm90
)
@ -498,6 +500,15 @@ cutlass_test_unit_add_executable(
gemv.cu
)
cutlass_test_unit_add_executable(
cutlass_test_unit_gemv_device_strided_batched
BATCH_SOURCES ON
BATCH_SIZE 4
gemv_strided_batched.cu
)
if (NOT CUDA_COMPILER MATCHES "[Cc]lang")
add_dependencies(

View File

@ -77,7 +77,8 @@ struct TestbedImpl {
using StrideA = typename Gemm::GemmKernel::StrideA;
using ElementB = typename Gemm::GemmKernel::ElementB;
using StrideB = typename Gemm::GemmKernel::StrideB;
using ElementC = typename Gemm::GemmKernel::ElementC;
using ElementC = std::conditional_t<std::is_void_v<typename Gemm::GemmKernel::ElementC>,
typename Gemm::GemmKernel::ElementD,typename Gemm::GemmKernel::ElementC>;
using StrideC = typename Gemm::GemmKernel::StrideC;
using ElementD = typename Gemm::GemmKernel::ElementD;
using StrideD = typename Gemm::GemmKernel::StrideD;

View File

@ -0,0 +1,490 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide strided batched GEMV interface
*/
#include <iostream>
#include <fstream>
#include <sstream>
#include "cutlass/cutlass.h"
#include "cutlass/gemm/kernel/gemv_strided_batched.h"
#include "cutlass/gemm/device/gemv_strided_batched.h"
#include "../../common/cutlass_unit_test.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/tensor_view_io.h"
#include "cutlass/util/distribution.h"
#include "cutlass/util/reference/host/tensor_fill.h"
#include "cutlass/util/reference/host/tensor_copy.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/reference/host/tensor_norm.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/gemm_complex.h"
#include "testbed_utils.h"
/////////////////////////////////////////////////////////////////////////////////////////////////
namespace test {
namespace gemm {
/// Test harness for the device-wide strided batched GEMV interface.
///
/// Owns host/device tensors for operand A (matrices), B (input vectors),
/// C (source vectors), and D (output vectors), initializes them, launches
/// the GemvStridedBatched operator under test, and verifies the device
/// output against a host-side GemmComplex reference computation.
template <typename GemvStridedBatched>
class TestbedStridedBatchedGemv
{
public:

  // Operand, layout, and accumulator types mirrored from the kernel under test.
  using ElementA = typename GemvStridedBatched::ElementA;
  using LayoutA = typename GemvStridedBatched::LayoutA;
  using ElementB = typename GemvStridedBatched::ElementB;
  using ElementC = typename GemvStridedBatched::ElementC;
  using ElementAccumulator = typename GemvStridedBatched::ElementAccumulator;
  using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute;

  // Vector operands (B, C, D) are modeled as row-major (n x 1) tensors.
  using LayoutV = cutlass::layout::RowMajor;

private:

  /// Initialization
  cutlass::Distribution::Kind init_A;   // distribution used to fill operand A
  cutlass::Distribution::Kind init_B;   // distribution used to fill operand B
  cutlass::Distribution::Kind init_C;   // distribution used to fill operand C
  uint64_t seed;                        // base RNG seed; offset per operand below

  // Batches are packed contiguously along the row dimension: tensor_A holds
  // batch_count stacked (m x k) matrices; the vectors hold stacked columns.
  cutlass::HostTensor<ElementA, LayoutA> tensor_A;
  cutlass::HostTensor<ElementB, LayoutV> tensor_B;
  cutlass::HostTensor<ElementC, LayoutV> tensor_C;
  cutlass::HostTensor<ElementC, LayoutV> tensor_D;
  cutlass::HostTensor<ElementC, LayoutV> reference_D;  // host-computed expected output

public:

  //
  // Methods
  //

  /// Constructor: records operand distributions and the RNG seed.
  TestbedStridedBatchedGemv(
    cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform,
    cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform,
    uint64_t seed_ = 2023):
    init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {}

  /// Helper to initialize a tensor view
  ///
  /// Fills `view` according to `dist_kind` using `seed`. Returns false (and
  /// fails the test) for distributions that are not implemented.
  template <typename Element, typename Layout>
  bool initialize_tensor(
    cutlass::TensorView<Element, Layout> view,
    cutlass::Distribution::Kind dist_kind,
    uint64_t seed) {

    if (dist_kind == cutlass::Distribution::Uniform) {

      double scope_max, scope_min;
      int bits_input = cutlass::sizeof_bits<Element>::value;
      int bits_output = cutlass::sizeof_bits<typename GemvStridedBatched::ElementC>::value;

      // Narrow the random value range for low-precision types so the
      // accumulation remains exactly representable.
      if (bits_input == 1) {
        scope_max = 2;
        scope_min = 0;
      } else if (bits_input <= 8) {
        scope_max = 2;
        scope_min = -2;
      } else if (bits_output == 16) {
        // NOTE(review): this branch keys off the *output* width while the
        // others key off the input width — confirm the asymmetry is intended.
        scope_max = 5;
        scope_min = -5;
      } else {
        scope_max = 8;
        scope_min = -8;
      }

      cutlass::reference::host::TensorFillRandomUniform(
        view, seed, scope_max, scope_min, 0);
    }
    else if (dist_kind == cutlass::Distribution::Identity) {
      cutlass::reference::host::TensorFillIdentity(view);
    }
    else if (dist_kind == cutlass::Distribution::Gaussian) {
      cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5);
    }
    else if (dist_kind == cutlass::Distribution::Sequential) {
      cutlass::reference::host::BlockFillSequential(
        view.data(), view.capacity());
    }
    else {
      // TODO: Implement the rest
      EXPECT_TRUE(false) << "Not implemented";
      return false;
    }

    return true;
  }

  /// Initializes data structures
  ///
  /// Sizes all tensors for `batch_count` batches of `problem_size`, fills
  /// them on the host, and copies the operands to the device.
  void initialize(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count
  ) {

    //
    // Allocate the GEMV workspace
    //

    tensor_A.resize({batch_count * problem_size.row(), problem_size.column()});
    tensor_B.resize({batch_count * problem_size.column(), 1});
    tensor_C.resize({batch_count * problem_size.row(), 1});
    tensor_D.resize({batch_count * problem_size.row(), 1});
    // Host-only allocation: the reference output never goes to the device.
    reference_D.resize({batch_count * problem_size.row(), 1}, false);

    EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 1));
    EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2));
    EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 3));

    // It is possible to randomly initialize to all zeros, so override this with non-zeros
    // in the upper left corner of each operand.
    tensor_A.host_view().at({0, 0}) = typename GemvStridedBatched::ElementA(1);
    tensor_B.host_view().at({0, 0}) = typename GemvStridedBatched::ElementB(1);
    tensor_C.host_view().at({0, 0}) = typename GemvStridedBatched::ElementC(1);

    // Seed the reference output with C so beta scaling starts from the same data.
    cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view());

    tensor_A.sync_device();
    tensor_B.sync_device();
    tensor_C.sync_device();
    tensor_D.sync_device();
  }

  /// Compares computed reference with device reference and outputs to a file if incorrect
  bool compare_reference(
    cutlass::MatrixCoord problem_size,
    ElementCompute alpha,
    ElementCompute beta) {

    tensor_D.sync_host();

    // Guard against degenerate all-zero tensors that would make the
    // subsequent equality check vacuous.
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0);
    EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0);

    bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view());

    EXPECT_TRUE(passed) << " mismatched reference";

    if (!passed) {

      // NOTE(review): filename appears inherited from the universal GEMM
      // testbed — consider a gemv-specific name.
      std::ofstream file("testbed_universal_errors.txt");

      file
        << "problem: " << problem_size
        << ", alpha: " << alpha << ", beta: " << beta << "\n\n";

      file
        << "A =\n" << tensor_A.host_view()
        << "\nB =\n" << tensor_B.host_view()
        << "\nC =\n" << tensor_C.host_view()
        << "\n\nReference =\n" << reference_D.host_view()
        << "\nComputed =\n" << tensor_D.host_view();
    }

    return passed;
  }

  /// Verifies the result
  ///
  /// Runs a batched host GemmComplex with n == 1 (i.e., a GEMV per batch)
  /// into reference_D, then compares against the device output.
  bool verify(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count,
    int64_t batch_stride_A,
    int64_t batch_stride_B,
    int64_t batch_stride_C,
    int64_t batch_stride_D,
    ElementCompute alpha,
    ElementCompute beta) {

    //
    // Verify
    //

    cutlass::reference::host::GemmComplex<
        typename GemvStridedBatched::ElementA, typename GemvStridedBatched::LayoutA,
        typename GemvStridedBatched::ElementB, LayoutV,
        typename GemvStridedBatched::ElementC, LayoutV,
        ElementCompute, ElementAccumulator
    >(
      {problem_size.row(), 1, problem_size.column()},
      alpha,
      tensor_A.host_ref(),
      GemvStridedBatched::kTransformA,
      tensor_B.host_ref(),
      GemvStridedBatched::kTransformB,
      beta,
      tensor_C.host_ref(),
      reference_D.host_ref(),
      ElementAccumulator(0),
      batch_count,
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D
    );

    return compare_reference(problem_size, alpha, beta);
  }

  /// Runs one problem size
  ///
  /// Initializes data, checks implementability, launches the operator, and
  /// verifies. Returns true if the device result matches the host reference.
  bool run(
    cutlass::MatrixCoord problem_size,
    int32_t batch_count,
    int64_t batch_stride_A,
    int64_t batch_stride_B,
    int64_t batch_stride_C,
    int64_t batch_stride_D,
    ElementCompute alpha,
    ElementCompute beta) {

    this->initialize(problem_size, batch_count);

    //
    // Initialize the GEMV operator
    //

    typename GemvStridedBatched::Arguments arguments{
      problem_size,
      batch_count,
      {alpha, beta},
      tensor_A.device_ref(),
      tensor_B.device_data(),
      tensor_C.device_data(),
      tensor_D.device_data(),
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D
    };

    GemvStridedBatched gemm_op;

    cutlass::Status status = gemm_op.can_implement(arguments);
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    size_t workspace_size = GemvStridedBatched::get_workspace_size(arguments);
    cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);

    status = gemm_op.initialize(arguments, workspace.get());
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    //
    // Run the GEMV
    //

    status = gemm_op();
    EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status);

    //
    // Verify
    //

    bool passed = this->verify(
      problem_size,
      batch_count,
      batch_stride_A,
      batch_stride_B,
      batch_stride_C,
      batch_stride_D,
      alpha,
      beta);

    return passed;
  }
};
/////////////////////////////////////////////////////////////////////////////////////////////////
/// Sweeps a fixed grid of batch counts, problem sizes, and epilogue scalars
/// through TestbedStridedBatchedGemv. Stops and returns false at the first
/// failing configuration; returns true when every point passes.
template <typename GemvStridedBatched>
bool TestAllGemv() {

  using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute;

  // Test-point grids.
  int const batch_counts[] = {1, 520, 1314};
  int const problem_m[] = {1, 5, 16};
  int const problem_k[] = {8, 128, 256};
  double const alphas[] = {1, 1.25};
  double const betas[] = {0, 1, 1.25};

  for (int batch : batch_counts) {
    for (int m : problem_m) {
      for (int k : problem_k) {
        for (double alpha : alphas) {
          for (double beta : betas) {

            // Batches are packed contiguously: A advances m*k elements per
            // batch, B advances k, and C/D advance m.
            TestbedStridedBatchedGemv<GemvStridedBatched> testbed;

            bool passed = testbed.run(
              {m, k},
              batch,
              m * k,
              k,
              m,
              m,
              ElementCompute(alpha),
              ElementCompute(beta));

            if (!passed) {
              return false;
            }
          }
        }
      }
    }
  }

  return true;
}
} // namespace gemm
} // namespace test
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: half-precision inputs and outputs, f32 accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f16n_f16_f16_simt_f32, Simple) {

  using ElementInput = cutlass::half_t;
  using ElementOutput = cutlass::half_t;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = float;

  // 8 elements per vectorized memory access.
  int const kElementsPerAccess = 8;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: single-precision inputs, outputs, and accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f32n_f32_f32_simt_f32, Simple) {

  using ElementInput = float;
  using ElementOutput = float;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = float;

  // 4 elements per vectorized memory access.
  int const kElementsPerAccess = 4;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Strided batched GEMV: double-precision inputs, outputs, and accumulation,
// SIMT path (minimum SM50).
TEST(SM50_Device_StridedBatchedGemv_f64n_f64_f64_simt_f64, Simple) {

  using ElementInput = double;
  using ElementOutput = double;
  using LayoutA = cutlass::layout::RowMajor;
  using ElementAccumulator = double;

  // 2 elements per vectorized memory access.
  int const kElementsPerAccess = 2;

  // Epilogue computes alpha * accum + beta * C with scalar granularity 1.
  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput,
    1,
    ElementAccumulator,
    ElementAccumulator>;

  using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched<
    cutlass::gemm::kernel::GemvStridedBatched<
      ElementInput,       // Element A
      LayoutA,            // Layout A
      ElementInput,       // Element B
      ElementOutput,      // Element C
      ElementAccumulator, // Element accumulator
      kElementsPerAccess, // Element access granularity
      EpilogueOp          // Output operator
    >>;

  // Run the full sweep of batch counts, sizes, and scalars.
  EXPECT_TRUE(test::gemm::TestAllGemv<GemvStridedBatched>());
}
/////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -182,9 +182,9 @@ class GemmOperation:
ar = self.arch,
op = opcode_class_name,
ex = self.extended_name_3x(),
tbm = self.tile_description.threadblock_shape[0],
tbn = self.tile_description.threadblock_shape[1],
tbk = self.tile_description.threadblock_shape[2],
tbm = self.tile_description.tile_shape[0],
tbn = self.tile_description.tile_shape[1],
tbk = self.tile_description.tile_shape[2],
cm = self.tile_description.cluster_shape[0],
cn = self.tile_description.cluster_shape[1],
ck = self.tile_description.cluster_shape[2],
@ -640,7 +640,7 @@ class EmitGemmUniversal3xInstance:
using ${operation_name}_epilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
${arch}, ${opcode_class},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::epilogue::collective::EpilogueTileAuto,
${element_accumulator}, ${element_epilogue},
@ -655,7 +655,7 @@ using ${operation_name}_mainloop =
${element_a}, ${layout_a}, ${align_a},
${element_b}, ${layout_b}, ${align_b},
${element_accumulator},
cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
cute::Shape<cute::_${tile_shape_m}, cute::_${tile_shape_n}, cute::_${tile_shape_k}>,
cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
cutlass::gemm::collective::StageCountAutoCarveout<
sizeof(typename ${operation_name}_epilogue::SharedStorage)>,
@ -686,14 +686,14 @@ ${compile_guard_end}
#
def emit(self, operation):
threadblock_shape = operation.tile_description.threadblock_shape
tile_shape = operation.tile_description.tile_shape
warp_count = operation.tile_description.warp_count
# stage count set to zero indicates builder automatic stage selection
if operation.tile_description.stages > 0:
stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>"
else:
stage_count_string = "cutlass::gemm::collective::StageCountAuto"
warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]
warp_shape = [tile_shape[idx] // warp_count[idx] for idx in range(3)]
instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \
(operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)
@ -727,9 +727,9 @@ ${compile_guard_end}
'element_accumulator': DataTypeTag[operation.accumulator_type()],
'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
'arch': "cutlass::arch::Sm%d" % operation.arch,
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
'tile_shape_m': str(operation.tile_description.tile_shape[0]),
'tile_shape_n': str(operation.tile_description.tile_shape[1]),
'tile_shape_k': str(operation.tile_description.tile_shape[2]),
'cluster_m': str(operation.tile_description.cluster_shape[0]),
'cluster_n': str(operation.tile_description.cluster_shape[1]),
'cluster_k': str(operation.tile_description.cluster_shape[2]),

View File

@ -91,22 +91,21 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
# Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts
def CreateGemmUniversal3xOperator(
manifest, layouts, tile_descriptions, data_type,
manifest, layouts, tile_descriptions, data_types,
schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]],
complex_transforms=None,
epilogue_functor=EpilogueFunctor.LinearCombination,
swizzling_functor=SwizzlingFunctor.Identity1):
if type(data_types) is dict:
data_types = [data_types]
for s in schedules:
assert(len(s) == 2)
if complex_transforms is None:
complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ]
element_a = data_type["a_type"]
element_b = data_type["b_type"]
element_c = data_type["c_type"]
element_d = data_type["d_type"]
element_acc = data_type["acc_type"]
element_epilogue = data_type.get("epi_type", element_acc)
operations = []
# by default, only generate the largest tile and largest alignment
@ -115,23 +114,25 @@ def CreateGemmUniversal3xOperator(
for layout in layouts:
for tile_description in tile_descriptions:
for complex_transform in complex_transforms:
for kernel_schedule, epilogue_schedule in schedules:
A = TensorDescription(
element_a, layout[0][0], layout[0][1], complex_transform[0])
B = TensorDescription(
element_b, layout[1][0], layout[1][1], complex_transform[1])
for data_type in data_types:
for complex_transform in complex_transforms:
for kernel_schedule, epilogue_schedule in schedules:
A = TensorDescription(
data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0])
B = TensorDescription(
data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1])
C = TensorDescription(element_c, layout[2][0], layout[2][1])
D = TensorDescription(element_d, layout[2][0], layout[2][1])
C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1])
D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1])
operation = GemmOperation(
GemmKind.Universal3x, tile_description.minimum_compute_capability,
tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, D,
kernel_schedule, epilogue_schedule)
element_compute = data_type.get("epi_type", data_type["acc_type"])
operation = GemmOperation(
GemmKind.Universal3x, tile_description.minimum_compute_capability,
tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D,
kernel_schedule, epilogue_schedule)
manifest.append(operation)
operations.append(operation)
manifest.append(operation)
operations.append(operation)
return operations
@ -4118,21 +4119,19 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
layout[2][1] = 8
if CudaToolkitVersionSatisfies(cuda_version, 12, 1):
kernel_schedules = [
KernelScheduleType.ScheduleAuto,
KernelScheduleType.TmaWarpSpecializedCooperative,
KernelScheduleType.TmaWarpSpecializedPingpong,
KernelScheduleType.TmaWarpSpecialized
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
else:
kernel_schedules = [
KernelScheduleType.ScheduleAuto,
KernelScheduleType.TmaWarpSpecialized
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
# TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance.
]
schedules = [[s, EpilogueScheduleType.ScheduleAuto] for s in kernel_schedules]
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules)
# persistent kernels with TMA epilogues
@ -4140,6 +4139,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Emit instance without C allocation+load
data_type["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# for mixed precision kernels, also generate kernels that write output matrix in the A/B format
# Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation)
@ -4166,6 +4170,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
# Emit instance without C allocation+load
data_type_mixed["c_type"] = DataType.void
CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed,
[[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized],
[KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]])
#
def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
@ -4212,19 +4221,32 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
"acc_type" : math_inst.element_accumulator,
"epi_type" : math_inst.element_accumulator
}
schedules = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
]
# TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides),
# because they use NN kernels underneath and transposing its epilogue will get the correct output
schedules_transposed_epilogue = [
[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed],
[KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed]
]
# TMA kernels with TN or NN layout
layouts_tf32_tn_nn = [layouts_tf32[0], layouts_tf32[2]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32, schedules)
# TMA kernels with NT layout, only support 64x128x32 tile for now.
layouts_tf32_nt = [layouts_tf32[3]]
tile_64x128x32_descriptions = [tile_descriptions[0], tile_descriptions[1], tile_descriptions[2]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32)
tile_128x128x32_descriptions = [tile_descriptions[3], tile_descriptions[4], tile_descriptions[5]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_tf32, [schedules[1]])
# TMA kernels with TT layout use EpilogueTransposed, because swapping NN kernel and transposed its epilogue will get the kernel
layouts_tf32_tt = [layouts_tf32[1]]
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32,
[[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, schedules_transposed_epilogue)
# F32 kernel share same settings with tf32 I/O kernels excluding data type
data_type_f32 = {
@ -4236,10 +4258,10 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
"epi_type" : DataType.f32
}
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32,
[[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32, schedules)
CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_f32, [schedules[1]])
CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, schedules_transposed_epilogue)
#
def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
@ -4910,8 +4932,8 @@ def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version):
#
def GenerateSM90(manifest, cuda_version):
GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version)
GenerateSM90_TensorOp_1684(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex(manifest, cuda_version)
GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version)

View File

@ -40,6 +40,7 @@ GeneratorTargetNames = {
#
class DataType(enum.Enum):
void = enum_auto() # primarily used to disable C tensor for epilogues
b1 = enum_auto()
u4 = enum_auto()
u8 = enum_auto()
@ -89,6 +90,7 @@ ShortDataTypeNames = {
#
DataTypeNames = {
DataType.void: "void",
DataType.b1: "b1",
DataType.u4: "u4",
DataType.u8: "u8",
@ -121,10 +123,11 @@ DataTypeNames = {
DataType.cs8: "cs8",
DataType.cs16: "cs16",
DataType.cs32: "cs32",
DataType.cs64: "cs64",
DataType.cs64: "cs64",
}
DataTypeTag = {
DataType.void: "void",
DataType.b1: "cutlass::uint1b_t",
DataType.u4: "cutlass::uint4b_t",
DataType.u8: "uint8_t",
@ -161,6 +164,7 @@ DataTypeTag = {
}
DataTypeSize = {
DataType.void: 0,
DataType.b1: 1,
DataType.u4: 4,
DataType.u8: 8,
@ -765,6 +769,7 @@ class TileDescription:
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute, cluster_shape = [1,1,1]):
self.threadblock_shape = threadblock_shape
self.tile_shape = threadblock_shape
self.stages = stages
self.warp_count = warp_count
self.math_instruction = math_instruction

View File

@ -240,7 +240,9 @@ class Manifest:
self.kernel_filter_list = []
else:
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
_LOGGER.info("Using {filter_count} kernel filters from {filter_file}".format(
filter_count = len(self.kernel_filter_list),
filter_file = args.kernel_filter_file))
self.operation_count = 0
self.operations_by_name = {}
@ -311,19 +313,29 @@ class Manifest:
# compare against the include list
for name_substr in self.kernel_names:
if self._filter_string_matches(name_substr, name):
_LOGGER.debug("Kernel {kernel} included due to filter string '{filt}'.".format(
kernel = operation.procedural_name(),
filt = name_substr))
enabled = True
break
# compare against the exclude list
for name_substr in self.ignore_kernel_names:
if self._filter_string_matches(name_substr, name):
_LOGGER.debug("Kernel {kernel} ignored due to filter string '{filt}'.".format(
kernel = operation.procedural_name(),
filt = name_substr))
enabled = False
break
if len(self.kernel_filter_list) > 0:
enabled = False
if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
enabled = True
_LOGGER.debug("Kernel {kernel} matched via kernel filter file.".format(kernel = operation.procedural_name()))
enabled = True
else:
_LOGGER.debug("Kernel {kernel} culled due to no match in kernel filter file.".format(kernel = operation.procedural_name()))
enabled = False
# todo: filter based on compute data type
return enabled
@ -389,6 +401,8 @@ class Manifest:
for operation_kind, configurations in self.operations.items():
with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
for configuration_name, operations in configurations.items():
_LOGGER.info("Emitting {config} with {num_ops} operations.".format(
config = configuration_name, num_ops = len(operations)))
operation_kind_emitter.emit(configuration_name, operations)
source_files += operation_kind_emitter.source_files

View File

@ -64,6 +64,10 @@ namespace library {
template <typename T> struct NumericTypeMap;
template <> struct NumericTypeMap<void> {
static NumericTypeID const kId = NumericTypeID::kVoid;
};
template <> struct NumericTypeMap<cutlass::uint1b_t> {
static NumericTypeID const kId = NumericTypeID::kB1;
};

View File

@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid
cutlass_add_executable_tests(
test_profiler cutlass_profiler
DEPENDEES test_all
TEST_COMMAND_OPTIONS
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM
TEST_COMMAND_OPTIONS
GEMM
CONV2D
CONV3D
SPGEMM
RANK_K
RANK_2K
TRMM
SYMM
TEST_COMMAND_OPTIONS_PREFIX
CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_
DISABLE_EXECUTABLE_INSTALL_RULE
)

View File

@ -124,7 +124,7 @@ int CutlassProfiler::operator()() {
options_.execution_mode == ExecutionMode::kTrace) {
// Profiles all operations
profile_();
return profile_();
}
else if (options_.execution_mode == ExecutionMode::kEnumerate) {
// Enumerates all operations
@ -157,7 +157,7 @@ int CutlassProfiler::profile_() {
if (result) {
return result;
}
}
}
}

View File

@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const {
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_from_device(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) {
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_from_host(void const *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed host-to-device copy");
}
}
/// Copies from an equivalent-sized tensor in device memory
void DeviceAllocation::copy_to_host(void *ptr) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping copy of size 0 allocation\n";
#endif
return;
}
cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost);
if (result != cudaSuccess) {
throw std::runtime_error("Failed device-to-device copy");
throw std::runtime_error("Failed device-to-host copy");
}
}
void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
switch (type_) {
case library::NumericTypeID::kFE4M3:
cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>(
reinterpret_cast<cutlass::float_e4m3_t *>(pointer_),
reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()),
capacity_,
seed,
dist
@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
break;
case library::NumericTypeID::kFE5M2:
cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>(
reinterpret_cast<cutlass::float_e5m2_t *>(pointer_),
reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()),
capacity_,
seed,
dist
@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) {
}
void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}
@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz
}
void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) {
if (!good()) {
if (!bytes()) {
#ifndef NDEBUG
std::cout << "Skipping initialization of size 0 allocation\n";
#endif
return;
}
if (!data()) {
throw std::runtime_error("Attempting to initialize invalid allocation.");
}

View File

@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
{ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"},
{ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"},
{ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"},
{ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"},
{ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"},
{ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"},
{ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"},
@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse(
return Status::kErrorInvalidProblem;
}
if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) {
return Status::kErrorInvalidProblem;
}
if (!arg_as_scalar(
this->alpha,
operation_desc.element_epilogue,
@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
set_argument(result, "C", problem_space,
std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout));
set_argument(result, "D", problem_space,
std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout));
set_argument(result, "m", problem_space, m);
set_argument(result, "n", problem_space, n);
set_argument(result, "k", problem_space, k);
@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace(
problem_.batch_count * gemm_workspace_.problem_count
);
gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data());
// NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels
gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)};
gemm_workspace_.arguments.batch_count = problem_.batch_count;

View File

@ -270,17 +270,17 @@ int OperationProfiler::profile_all(
ProblemSpace::Iterator problem_it = problem_space.begin();
ProblemSpace::Iterator problem_end = problem_space.end();
bool continue_profiling = true, internal_error = false;
bool continue_profiling = true;
int retval = 0;
// For each problem in problem space
for (; continue_profiling && problem_it != problem_end; ++problem_it) {
ProblemSpace::Problem problem = problem_it.at();
report.next_problem();
// For each operation in manifest
for (auto const & operation_ptr : manifest) {
int matched_operation_count = 0;
for (auto const& operation_ptr : manifest) {
library::Operation const *operation = operation_ptr.get();
@ -292,8 +292,8 @@ int OperationProfiler::profile_all(
// Execute compatible cutlass operations if they satisfy the current device's compute capability
if (operation->description().kind == kind_ &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
operation->description().provider == library::Provider::kCUTLASS &&
options.device.compute_capability() >= min_cc &&
options.device.compute_capability() <= max_cc) {
std::string operation_name(operation->description().name);
@ -320,7 +320,10 @@ int OperationProfiler::profile_all(
if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) {
continue;
}
// we have found a kernel match, so increment the counter for match kernels
++matched_operation_count;
// A. Initialize configuration
Status status = this->initialize_configuration(
options,
@ -374,7 +377,6 @@ int OperationProfiler::profile_all(
//
// B. Verify CUTLASS
if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) {
continue_profiling = this->verify_cutlass(
@ -426,10 +428,18 @@ int OperationProfiler::profile_all(
if (!continue_profiling) {
break;
}
}
}
// If we did not find any kernels that match our filters and error_on_no_match was set, report an error
if (options.profiling.error_on_no_match && matched_operation_count <= 0) {
#if !NDEBUG
std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl;
#endif
retval = 1;
}
}
return internal_error ? 1 : 0;
return retval;
}
///////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline):
}
else if (cmdline.check_cmd_line_flag("kernels")) {
cmdline.get_cmd_line_arguments("kernels", operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
if (cmdline.check_cmd_line_flag("ignore-kernels")) {
cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names);
profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match");
}
// Prevent launches on the device for anything other than CUTLASS operation

View File

@ -196,6 +196,9 @@ public:
/// If true, profiling is actually conducted.
bool enabled;
/// If true, profiling returns an error code if no kernels are found to match the filters.
bool error_on_no_match = false;
/// List of providers of each functionality to be profiled
ProviderVector providers;