@@ -25,7 +25,7 @@
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

cmake_policy(SET CMP0112 NEW)
include(GNUInstallDirs)

find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED)
@@ -94,6 +94,9 @@ file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOU
# set cutlass generator compiler version to filter kernels in the generator not supported by a specific toolkit.
set(CUTLASS_GENERATOR_CUDA_COMPILER_VERSION ${CMAKE_CUDA_COMPILER_VERSION})

# --log-level is set to DEBUG to enable printing information about which kernels were excluded
# from generation in /tools/library/scripts/manifest.py. To avoid having this information appear
# in ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log, set this parameter to INFO
execute_process(
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/scripts
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/scripts/generator.py
@@ -112,6 +115,8 @@ execute_process(
  ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log
)

message(STATUS "Completed generation of library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log for more information.")

if(NOT cutlass_lib_INSTANCE_GENERATION_RESULT EQUAL 0)
  message(FATAL_ERROR "Error generating library instances. See ${CMAKE_CURRENT_BINARY_DIR}/library_instance_generation.log")
endif()
@@ -102,6 +102,12 @@ template <typename OperatorClass> struct ArchMap<arch::Sm90, OperatorClass> {
  static int const kMax = 1024;
};

+// Arch conditional WGMMA
+template <> struct ArchMap<arch::Sm90, arch::OpClassTensorOp> {
+  static int const kMin = 90;
+  static int const kMax = 90;
+};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
@@ -178,7 +178,7 @@ public:
    int K,                                    /// GEMM K dimension

    NumericTypeID element_compute,            /// Data type of internal accumulation

    NumericTypeID element_scalar,             /// Data type of alpha/beta scalars

    void const *alpha,                        /// Pointer to alpha scalar
@@ -186,29 +186,29 @@ public:
    NumericTypeID element_A,                  /// Data type of A matrix elements
    LayoutTypeID layout_A,                    /// Layout of A matrix
    ComplexTransform transform_A,             /// Complex transformation applied to A matrix - ignored for real-valued matrices

    void const * ptr_A,                       /// Pointer to A matrix in Global Memory
    int64_t lda,                              /// Leading dimension of A matrix

    NumericTypeID element_B,                  /// Data type of B matrix elements
    LayoutTypeID layout_B,                    /// Layout of B matrix
    ComplexTransform transform_B,             /// Complex transformation applied to B matrix - ignored for real-valued matrices

    void const * ptr_B,                       /// Pointer to B matrix in Global Memory
    int64_t ldb,                              /// Leading dimension of B matrix

    void const * beta,                        /// Pointer to beta scalar

-    NumericTypeID element_C,                  /// Data type of C and D matrices
+    NumericTypeID element_C,                  /// Data type of C matrix
    LayoutTypeID layout_C,                    /// Layout of C matrix
    void const * ptr_C,                       /// Pointer to C matrix
    int64_t ldc,                              /// Leading dimension of C matrix

    NumericTypeID element_D,                  /// Data type of D matrix
    LayoutTypeID layout_D,                    /// Layout of D matrix
    void * ptr_D,                             /// Pointer to D matrix
    int64_t ldd,                              /// Leading dimension of D matrix

    int batch_count = 1,                      /// Batch count or number of split-K slices

    int64_t batch_stride_A = 0,               /// Batch stride of A operand
    int64_t batch_stride_B = 0,               /// Batch stride of B operand
    int64_t batch_stride_C = 0,               /// Batch stride of C operand
@@ -114,6 +114,8 @@ enum class NumericTypeID {
  kS16,
  kS32,
  kS64,
+  kFE4M3,
+  kFE5M2,
  kF16,
  kBF16,
  kTF32,
@@ -474,9 +476,12 @@ struct GemmDescription : public OperationDescription {
  /// Describes the B operand
  TensorDescription B;

-  /// Describes the source and destination matrices
+  /// Describes the source matrix
  TensorDescription C;

+  /// Describes the destination matrix
+  TensorDescription D;

  /// Describes the sparse meta matrices
  TensorDescription E;
@@ -501,6 +506,7 @@ struct GemmDescription : public OperationDescription {
    TensorDescription const &A = TensorDescription(),
    TensorDescription const &B = TensorDescription(),
    TensorDescription const &C = TensorDescription(),
+    TensorDescription const &D = TensorDescription(),
    NumericTypeID element_epilogue = NumericTypeID::kInvalid,
    SplitKMode split_k_mode = SplitKMode::kNone,
    ComplexTransform transform_A = ComplexTransform::kNone,
@@ -510,6 +516,7 @@ struct GemmDescription : public OperationDescription {
    A(A),
    B(B),
    C(C),
+    D(D),
    element_epilogue(element_epilogue),
    split_k_mode(split_k_mode),
    transform_A(transform_A),
@@ -527,13 +534,14 @@ struct SparseGemmDescription : public GemmDescription {
    TensorDescription const &A = TensorDescription(),
    TensorDescription const &B = TensorDescription(),
    TensorDescription const &C = TensorDescription(),
+    TensorDescription const &D = TensorDescription(),
    TensorDescription const &E = TensorDescription(),
    NumericTypeID element_epilogue = NumericTypeID::kInvalid,
    SplitKMode split_k_mode = SplitKMode::kNone,
    ComplexTransform transform_A = ComplexTransform::kNone,
    ComplexTransform transform_B = ComplexTransform::kNone
  ):
-    GemmDescription(gemm_kind, A, B, C, element_epilogue, split_k_mode, transform_A, transform_B)
+    GemmDescription(gemm_kind, A, B, C, D, element_epilogue, split_k_mode, transform_A, transform_B)
  {this->E = E;}
};
@@ -1019,6 +1027,9 @@ struct GemmUniversalArguments {
  int64_t batch_stride_B;
  int64_t batch_stride_C;
  int64_t batch_stride_D;

+  // Needed for some 3.x kernels
+  int sm_count;
};

/////////////////////////////////////////////////////////////////////////////////////////////////
@@ -66,6 +66,9 @@ struct GemmFunctionalKey {
  LayoutTypeID layout_B;
  ComplexTransform transform_B;
  NumericTypeID element_C;
+  LayoutTypeID layout_C;
+  NumericTypeID element_D;
+  LayoutTypeID layout_D;

  //
  // Methods
@@ -83,7 +86,10 @@ struct GemmFunctionalKey {
    NumericTypeID element_B = NumericTypeID::kF16,
    LayoutTypeID layout_B = LayoutTypeID::kColumnMajor,
    ComplexTransform transform_B = ComplexTransform::kNone,
-    NumericTypeID element_C = NumericTypeID::kF16
+    NumericTypeID element_C = NumericTypeID::kF16,
+    LayoutTypeID layout_C = LayoutTypeID::kColumnMajor,
+    NumericTypeID element_D = NumericTypeID::kF16,
+    LayoutTypeID layout_D = LayoutTypeID::kColumnMajor
  ):
    provider(provider),
    gemm_kind(gemm_kind),
@@ -95,7 +101,10 @@ struct GemmFunctionalKey {
    element_B(element_B),
    layout_B(layout_B),
    transform_B(transform_B),
-    element_C(element_C)
+    element_C(element_C),
+    layout_C(layout_C),
+    element_D(element_D),
+    layout_D(layout_D)
  { }

  inline
@@ -111,7 +120,10 @@ struct GemmFunctionalKey {
      (element_B == rhs.element_B) &&
      (layout_B == rhs.layout_B) &&
      (transform_B == rhs.transform_B) &&
-      (element_C == rhs.element_C);
+      (element_C == rhs.element_C) &&
+      (layout_C == rhs.layout_C) &&
+      (element_D == rhs.element_D) &&
+      (layout_D == rhs.layout_D);
  }

  inline
@@ -137,6 +149,9 @@ std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey
    << " layout_B: " << to_string(k.layout_B) << "\n"
    << " transform_B: " << to_string(k.transform_B) << "\n"
    << " element_C: " << to_string(k.element_C) << "\n"
+    << " layout_C: " << to_string(k.layout_C) << "\n"
+    << " element_D: " << to_string(k.element_D) << "\n"
+    << " layout_D: " << to_string(k.layout_D) << "\n"
    << "}";

  return out;
@@ -157,18 +172,21 @@ struct GemmFunctionalKeyHasher {
  size_t operator()(GemmFunctionalKey const &key) const {
    IntHash hash;

    return
      rotl(hash(int(key.provider)), 1) ^
      rotl(hash(int(key.gemm_kind)), 2) ^
      rotl(hash(int(key.element_compute)), 3) ^
      rotl(hash(int(key.element_scalar)), 4) ^
      rotl(hash(int(key.element_A)), 5) ^
      rotl(hash(int(key.layout_A)), 6) ^
      rotl(hash(int(key.transform_A)), 7) ^
      rotl(hash(int(key.element_B)), 8) ^
      rotl(hash(int(key.layout_B)), 9) ^
      rotl(hash(int(key.transform_B)), 10) ^
-      rotl(hash(int(key.element_C)), 11);
+      rotl(hash(int(key.element_C)), 11) ^
+      rotl(hash(int(key.layout_C)), 12) ^
+      rotl(hash(int(key.element_D)), 13) ^
+      rotl(hash(int(key.layout_D)), 14);
  }
};
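For intuition, here is a minimal Python sketch of the rotate-and-XOR scheme this hasher uses (helper names are illustrative, not part of the library): each field's hash is rotated by a distinct amount before the XOR, so two keys that merely swap values between fields still hash differently.

```python
def rotl(x: int, k: int, width: int = 64) -> int:
    """Rotate x left by k bits within a fixed bit width (mirrors the C++ rotl)."""
    x &= (1 << width) - 1
    k %= width
    return ((x << k) | (x >> (width - k))) & ((1 << width) - 1)

def gemm_key_hash(*fields: int) -> int:
    """XOR of per-field hashes, each rotated by its 1-based position,
    following the same pattern as GemmFunctionalKeyHasher above."""
    h = 0
    for i, f in enumerate(fields, start=1):
        h ^= rotl(hash(f), i)
    return h

# A plain XOR would collide when two fields exchange values; the rotation avoids that:
assert gemm_key_hash(1, 2) != gemm_key_hash(2, 1)
```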
@@ -23,7 +23,8 @@ from library import *
class GemmOperation:
  #
  def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \
-      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8):
+      epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8, D = None,
+      kernel_schedule = KernelScheduleType.ScheduleAuto, epilogue_schedule = EpilogueScheduleType.ScheduleAuto):

    self.prefix = "3x" if gemm_kind == GemmKind.Universal3x else ""
    self.operation_kind = OperationKind.Gemm
@@ -33,6 +34,15 @@ class GemmOperation:
    self.A = A
    self.B = B
    self.C = C
+    self.D = D
+    if self.D == None:
+      self.D = self.C
+
+    if gemm_kind != GemmKind.Universal3x:
+      assert(kernel_schedule == KernelScheduleType.ScheduleAuto)
+      assert(epilogue_schedule == EpilogueScheduleType.ScheduleAuto)
+    self.kernel_schedule = kernel_schedule
+    self.epilogue_schedule = epilogue_schedule
    self.element_epilogue = element_epilogue
    self.epilogue_functor = epilogue_functor
    self.swizzling_functor = swizzling_functor
@@ -122,11 +132,12 @@ class GemmOperation:

  def extended_name_3x(self):
    '''Generates a string representing the MMA atom. Assumes accumulator type is C type.'''
-    extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}".format(
+    extended_name = "{core_name}_{element_a}_{element_b}_{element_acc}_{element_c}_{element_d}".format(
      element_a = DataTypeNames[self.A.element],
      element_b = DataTypeNames[self.B.element],
      element_acc = DataTypeNames[self.tile_description.math_instruction.element_accumulator],
      element_c = DataTypeNames[self.C.element],
+      element_d = DataTypeNames[self.D.element],
      core_name = self.core_name())
    return extended_name
@@ -152,12 +163,20 @@ class GemmOperation:
      ShortLayoutTypeNames[self.B.layout],
      ShortLayoutTypeNames[self.C.layout])

+  # Generates a short string representing underlying kernel schedule type
+  def kernel_schedule_name_3x(self):
+    return KernelScheduleSuffixes[self.kernel_schedule]
+
+  # Generates a short string representing underlying epilogue schedule type
+  def epilogue_schedule_name_3x(self):
+    return EpilogueScheduleSuffixes[self.epilogue_schedule]

  # Generates the full kernel function name
  def procedural_name(self):
    ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
    opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
    if self.arch >= 90:
-      kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}"
+      kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}_{tbm}x{tbn}x{tbk}_{cm}x{cn}x{ck}_{l}_{s}_align{al}{k}{e}"
      return kernel_name_template.format(
        p = self.prefix,
        ar = self.arch,
@@ -171,7 +190,9 @@ class GemmOperation:
        ck = self.tile_description.cluster_shape[2],
        l = self.tile_description.stages,
        s = self.layout_name_3x(),
-        al = str(max(self.A.alignment, self.B.alignment)))
+        al = str(max(self.A.alignment, self.B.alignment)),
+        k = self.kernel_schedule_name_3x(),
+        e = self.epilogue_schedule_name_3x())
    else:
      threadblock = self.tile_description.procedural_name()
      return "cutlass{p}_{op}_{ex}_{tb}_{l}_align{a}".format(
@@ -604,8 +625,7 @@ class EmitGemmUniversal3xInstance:
      "cutlass/numeric_types.h",
      "cutlass/gemm/kernel/gemm_universal.hpp",
      "cutlass/gemm/collective/collective_builder.hpp",
-      "cutlass/epilogue/collective/default_epilogue.hpp",
-      "cutlass/epilogue/thread/linear_combination.h",
+      "cutlass/epilogue/collective/collective_builder.hpp",
    ]
    self.builtin_epilogue_functor_template = """
    ${epilogue_functor}<
@@ -617,6 +637,18 @@ class EmitGemmUniversal3xInstance:
"""
    self.gemm_template = """

+using ${operation_name}_epilogue =
+  typename cutlass::epilogue::collective::CollectiveBuilder<
+    ${arch}, ${opcode_class},
+    cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
+    cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
+    cutlass::epilogue::collective::EpilogueTileAuto,
+    ${element_accumulator}, ${element_epilogue},
+    ${element_c}, ${layout_c}, ${align_c},
+    ${element_d}, ${layout_d}, ${align_d},
+    ${epilogue_schedule}
+  >::CollectiveOp;

using ${operation_name}_mainloop =
  typename cutlass::gemm::collective::CollectiveBuilder<
    ${arch}, ${opcode_class},
@@ -625,18 +657,11 @@ using ${operation_name}_mainloop =
    ${element_accumulator},
    cute::Shape<cute::_${threadblock_shape_m}, cute::_${threadblock_shape_n}, cute::_${threadblock_shape_k}>,
    cute::Shape<cute::_${cluster_m},cute::_${cluster_n},cute::_${cluster_k}>,
-    cutlass::gemm::collective::StageCountAuto,
-    cutlass::gemm::collective::KernelScheduleAuto
+    cutlass::gemm::collective::StageCountAutoCarveout<
+      sizeof(typename ${operation_name}_epilogue::SharedStorage)>,
+    ${kernel_schedule}
  >::CollectiveOp;

-using ${operation_name}_epilogue =
-  cutlass::epilogue::collective::DefaultEpilogue<
-    cutlass::gemm::TagToStrideC_t<${layout_c}>,
-    cutlass::gemm::TagToStrideC_t<${layout_c}>,
-    cutlass::epilogue::thread::LinearCombination<
-      ${element_c}, ${epilogue_vector_length}, ${element_accumulator}, ${element_epilogue}>
-  >;

// Gemm operator ${operation_name}
using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal<
  cute::Shape<int,int,int,int>,
@@ -670,8 +695,8 @@ ${compile_guard_end}
      stage_count_string = "cutlass::gemm::collective::StageCountAuto"
    warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)]

-    instance_layout_A, instance_layout_B, instance_layout_C = \
-      (operation.A.layout, operation.B.layout, operation.C.layout)
+    instance_layout_A, instance_layout_B, instance_layout_C, instance_layout_D = \
+      (operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout)

    # 3.0 profiler integration only supports trivial epilogues for now
    epilogue_vector_length = 1
@@ -697,6 +722,8 @@ ${compile_guard_end}
      'layout_b': LayoutTag[instance_layout_B],
      'element_c': DataTypeTag[operation.C.element],
      'layout_c': LayoutTag[instance_layout_C],
+      'element_d': DataTypeTag[operation.D.element],
+      'layout_d': LayoutTag[instance_layout_D],
      'element_accumulator': DataTypeTag[operation.accumulator_type()],
      'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
      'arch': "cutlass::arch::Sm%d" % operation.arch,
@@ -712,10 +739,14 @@ ${compile_guard_end}
      'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
      'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
      'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
+      'kernel_schedule': str(KernelScheduleTag[operation.kernel_schedule]),
+      'epilogue_schedule': str(EpilogueScheduleTag[operation.epilogue_schedule]),
      'epilogue_functor': epilogue_functor,
      'stages': stage_count_string,
      'align_a': str(operation.A.alignment),
      'align_b': str(operation.B.alignment),
      'align_c': str(operation.C.alignment),
+      'align_d': str(operation.C.alignment),
      'transform_a': ComplexTransformTag[operation.A.complex_transform],
      'transform_b': ComplexTransformTag[operation.B.complex_transform],
      'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation],
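As a hypothetical usage sketch of the extended constructor (argument values invented for illustration; `tile_description`, `A`, `B`, `C` are the usual generator description objects): `D` falls back to `C` when omitted, and non-default schedules are only accepted for `GemmKind.Universal3x`.

```python
# Hypothetical example. D is omitted, so the constructor falls back to D = C.
op = GemmOperation(
    GemmKind.Universal3x, 90, tile_description, A, B, C,
    element_epilogue = DataType.f32,
    kernel_schedule = KernelScheduleType.TmaWarpSpecialized,
    epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized)
assert op.D.element == op.C.element

# For non-3.x kernels the schedules must stay at their ScheduleAuto defaults,
# otherwise the asserts in __init__ fire:
op2x = GemmOperation(GemmKind.Universal, 80, tile_description, A, B, C,
                     element_epilogue = DataType.f32)
```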
File diff suppressed because it is too large
@@ -361,6 +361,58 @@ ShortComplexLayoutNames = {
  (LayoutType.RowMajor, ComplexTransform.conj): 'h'
}

###################################################################################################
+class KernelScheduleType(enum.Enum):
+  ScheduleAuto = enum_auto()
+  Multistage = enum_auto()
+  Tma = enum_auto()
+  TmaWarpSpecialized = enum_auto()
+  TmaWarpSpecializedPingpong = enum_auto()
+  TmaWarpSpecializedCooperative = enum_auto()
+#
+KernelScheduleTag = {
+  KernelScheduleType.ScheduleAuto: 'cutlass::gemm::collective::KernelScheduleAuto',
+  KernelScheduleType.Multistage: 'cutlass::gemm::KernelMultistage',
+  KernelScheduleType.Tma: 'cutlass::gemm::KernelTma',
+  KernelScheduleType.TmaWarpSpecialized: 'cutlass::gemm::KernelTmaWarpSpecialized',
+  KernelScheduleType.TmaWarpSpecializedPingpong: 'cutlass::gemm::KernelTmaWarpSpecializedPingpong',
+  KernelScheduleType.TmaWarpSpecializedCooperative: 'cutlass::gemm::KernelTmaWarpSpecializedCooperative',
+}
+
+#
+KernelScheduleSuffixes = {
+  KernelScheduleType.ScheduleAuto: '',
+  KernelScheduleType.Multistage: '_cpasync',
+  KernelScheduleType.Tma: '_unspecialized',
+  KernelScheduleType.TmaWarpSpecialized: '_warpspecialized',
+  KernelScheduleType.TmaWarpSpecializedPingpong: '_warpspecialized_pingpong',
+  KernelScheduleType.TmaWarpSpecializedCooperative: '_warpspecialized_cooperative',
+}
+
+class EpilogueScheduleType(enum.Enum):
+  ScheduleAuto = enum_auto()
+  EpilogueTransposed = enum_auto()
+  NoSmemWarpSpecialized = enum_auto()
+  TmaWarpSpecialized = enum_auto()
+  TmaWarpSpecializedCooperative = enum_auto()
+#
+EpilogueScheduleTag = {
+  EpilogueScheduleType.ScheduleAuto: 'cutlass::epilogue::collective::EpilogueScheduleAuto',
+  EpilogueScheduleType.EpilogueTransposed: 'cutlass::gemm::EpilogueTransposed',
+  EpilogueScheduleType.NoSmemWarpSpecialized: 'cutlass::epilogue::NoSmemWarpSpecialized',
+  EpilogueScheduleType.TmaWarpSpecialized: 'cutlass::epilogue::TmaWarpSpecialized',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: 'cutlass::epilogue::TmaWarpSpecializedCooperative',
+}
+
+#
+EpilogueScheduleSuffixes = {
+  EpilogueScheduleType.ScheduleAuto: '',
+  EpilogueScheduleType.EpilogueTransposed: '',
+  EpilogueScheduleType.NoSmemWarpSpecialized: '_epi_nosmem',
+  EpilogueScheduleType.TmaWarpSpecialized: '_epi_tma',
+  EpilogueScheduleType.TmaWarpSpecializedCooperative: '_epi_tma',
+}

###################################################################################################

#
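To make the naming effect of these tables concrete, an illustrative snippet (values chosen arbitrarily, name stem abbreviated) renders a schedule pair into the `{k}{e}` portion of a 3.x procedural name:

```python
# Illustrative only: compose the suffix the way procedural_name() in
# gemm_operation.py does via kernel_schedule_name_3x()/epilogue_schedule_name_3x().
k = KernelScheduleSuffixes[KernelScheduleType.TmaWarpSpecializedPingpong]
e = EpilogueScheduleSuffixes[EpilogueScheduleType.TmaWarpSpecialized]
print("cutlass3x_sm90_..._align8" + k + e)
# -> cutlass3x_sm90_..._align8_warpspecialized_pingpong_epi_tma
```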
@@ -1,143 +0,0 @@
# PyCUTLASS: CUTLASS Python Interface

PyCUTLASS is a Python interface to the CUTLASS C++ template library. PyCUTLASS takes user-defined operation descriptions, emits C++ code, and compiles it with `nvcc` or `nvrtc`. It also provides wrappers for user-provided arguments from [numpy](https://numpy.org/), [torch](https://pytorch.org/), and [cupy](https://github.com/cupy/cupy) and encodes them into the kernel's parameters.
```python
import pycutlass
from pycutlass import *
import torch

pycutlass.get_memory_pool(2**8, 2**32)

math_inst = MathInstruction(
    [1, 1, 1], cutlass.float32, cutlass.float32, cutlass.float32,
    cutlass.OpClass.Simt, MathOperation.multiply_add
)

tile_description = TileDescription(
    [128, 128, 8], 4, [2, 4, 1],
    math_inst
)

A = TensorDescription(
    cutlass.float32, cutlass.RowMajor, 1
)

B = TensorDescription(
    cutlass.float32, cutlass.RowMajor, 1
)

C = TensorDescription(
    cutlass.float32, cutlass.RowMajor, 1
)

epilogue_functor = LinearCombination(cutlass.float32, 1, cutlass.float32, cutlass.float32)

operation = GemmOperationUniversal(
    arch=80, tile_description=tile_description,
    A=A, B=B, C=C,
    epilogue_functor=epilogue_functor,
    swizzling_functor=cutlass.IdentitySwizzle1
)

pycutlass.compiler.add_module([operation,])

problem_size = cutlass.gemm.GemmCoord(512, 256, 128)

tensor_A = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.k()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_B = torch.ceil(torch.empty(size=(problem_size.k(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_C = torch.ceil(torch.empty(size=(problem_size.m(), problem_size.n()), dtype=torch.float32, device="cuda").uniform_(-8.5, 7.5))
tensor_D = torch.empty_like(tensor_C)

alpha = 1.0
beta = 0.0

arguments = GemmArguments(
    operation=operation, problem_size=problem_size,
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    output_op=operation.epilogue_type(alpha, beta),
    gemm_mode=cutlass.gemm.Mode.Gemm, split_k_slices=1
)

operation.run(arguments)

arguments.sync()

tensor_D_ref = alpha * tensor_A @ tensor_B + beta * tensor_C

assert torch.equal(tensor_D, tensor_D_ref)
```
PyCUTLASS also provides infrastructure for profiling, compiled artifact management, and a memory pool manager.

## Supported Features
PyCUTLASS currently supports the following operations:
* GEMM with mode {Serial, Parallel Split K, Batched GEMM, Array GEMM}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor, Row/ColumnMajorInterleaved<32> for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, swizzling functions {IdentitySwizzle<1,2,4,8>, HorizontalSwizzle, BatchedIdentitySwizzle}, and epilogue {LinearCombination, LinearCombinationClamp}.
* GEMM grouped with op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {RowMajor, ColumnMajor}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, scheduling mode {Host, Device}, and epilogue {LinearCombination, LinearCombinationClamp}.
* Conv2d with {Fprop, Dgrad, Wgrad}, op class {SIMT, TensorCore}, data type {int8, f16, bf16, f32, f64}, layout {TensorNHWC, TensorNC32HW32 and TensorC32RSK32 for int8}, math operation {MultiplyAdd, MultiplyAddFastF16, MultiplyAddFastBF16, MultiplyAddFastF32}, split-k mode {Parallel, Serial}, and epilogue {LinearCombination, LinearCombinationClamp}.

The tiling size of the above operations can also be customized, as the sketch below illustrates.
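A minimal sketch of such a customization, assuming the same imports and helper classes as the quick-start example above (the tensor-op shapes here are illustrative, not a tuned configuration):

```python
# Illustrative tile customization: threadblock shape [M, N, K], stage count,
# and warps per threadblock are all user-settable.
math_inst = MathInstruction(
    [16, 8, 16], cutlass.float16, cutlass.float16, cutlass.float32,
    cutlass.OpClass.TensorOp, MathOperation.multiply_add
)
tile_description = TileDescription(
    [256, 128, 32],   # threadblock tile M x N x K
    3,                # number of software pipeline stages
    [4, 2, 1],        # warps per threadblock in M, N, K
    math_inst
)
```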

## Installation

### Using Docker
We recommend using one of the provided Docker images for running PyCUTLASS.

**To run CUTLASS 3 GEMM kernels targeting the NVIDIA Hopper architecture via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda12.0) based on the NGC CUDA 12.0 container:
```shell
docker build -t pycutlass-cuda12.0:latest -f docker/Dockerfile-cuda12.0 .
docker run --gpus all -it --rm pycutlass-cuda12.0:latest
```
Note that this Docker container does not include CuPy or PyTorch, and thus will not be able to run PyCUTLASS examples that leverage these packages.

**To run CUTLASS 2.x kernels targeting pre-SM90 architectures via PyCUTLASS,** you can use an included [Dockerfile](docker/Dockerfile-cuda11.8-pytorch) based on an NGC PyTorch container:
```shell
docker build -t pycutlass-cuda11.8-pytorch:latest -f docker/Dockerfile-cuda11.8-pytorch .
docker run --gpus all -it --rm pycutlass-cuda11.8-pytorch:latest
```
### Environment variables
PyCUTLASS requires two environment variables:
* `CUTLASS_PATH`: the root directory of CUTLASS. You can set this from the location at which you cloned CUTLASS via: `export CUTLASS_PATH=$(pwd)`.
* `CUDA_INSTALL_PATH`: the directory where the CUDA Toolkit is installed. If running in bash with `nvcc` installed under a CUDA Toolkit, you can set this to the location of your `nvcc` installation via: `export CUDA_INSTALL_PATH=$(which nvcc | awk -F'/bin/nvcc' '{print $1}')`

After setting these two environment variables, PyCUTLASS can be installed with
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass && bash build.sh
```
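As an illustrative sanity check (not part of the package), the two variables can be validated from Python before building:

```python
import os
import shutil

# Hypothetical pre-build check for the two required environment variables.
for var in ("CUTLASS_PATH", "CUDA_INSTALL_PATH"):
    path = os.environ.get(var)
    assert path and os.path.isdir(path), f"{var} must point to an existing directory"

nvcc = os.path.join(os.environ["CUDA_INSTALL_PATH"], "bin", "nvcc")
assert shutil.which("nvcc") or os.path.exists(nvcc), "nvcc not found"
```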

## Examples
Examples can be found in [$CUTLASS_PATH/examples/40_cutlass_py](examples/40_cutlass_py)

## Test
The test cases are listed in `$CUTLASS_PATH/tools/library/scripts/pycutlass/test`. The unit tests can be run with
```shell
# Each of these tests is only supported on devices with compute capability SM80. For other devices,
# see the basic examples in $CUTLASS_PATH/examples/40_cutlass_py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/unit && python test_sm80.py
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/test/example && bash run_all_example.sh
```
## Build documentation
Run
```shell
bash build_doc.sh
```
## Troubleshooting

### Issue 1: permission denied
Building PyCUTLASS requires installing dependencies into Python, so Conda could be an option if you don't have permission.

### Issue 2: rmm: module not found
PyCUTLASS manages device memory with [RMM](https://github.com/rapidsai/rmm). Our `build.sh` automatically pulls the [rmm branch-22.08](https://github.com/rapidsai/rmm/tree/branch-22.08) from GitHub and builds it from source. RMM is placed at `$CUTLASS_PATH/tools/library/scripts/pycutlass/rmm`. It requires `cmake > 3.20.1`. If the build fails, it can be fixed manually with the following steps:
```shell
cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm && ./build.sh librmm rmm

cd $CUTLASS_PATH/tools/library/scripts/pycutlass/rmm/python
python setup.py build_ext --inplace
python setup.py install
```
To test whether RMM is successfully installed, try `import rmm`. For other issues related to RMM, please check https://github.com/rapidsai/rmm/issues.
@@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

pip install -U pybind11
git clone https://github.com/google/googletest.git
python setup.py develop --user
python setup.py rmm
@@ -1,36 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

pip install enum-tools
pip install sphinx-toolbox
pip install m2r2
sphinx-build -b html docs/source/ docs/build/html
@@ -1,40 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

FROM nvcr.io/nvidia/pytorch:22.11-py3

RUN chmod ugo+rwx /home
RUN pip uninstall -y rmm
RUN pip install rmm-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda
@@ -1,46 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

FROM nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu20.04

RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
RUN apt-get install -y git cmake vim python3 python3-pip
RUN ln -s /usr/bin/python3 /usr/bin/python
RUN chmod ugo+rwx /home
RUN pip install numpy==1.23
RUN pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
RUN pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu/:$LIBRARY_PATH
ENV CUDA_INSTALL_PATH=/usr/local/cuda
@@ -1,52 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -1,35 +0,0 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -1,96 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#################################################################################################

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'PyCutlass'
copyright = '2022, Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'
author = 'Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.duration',
    'sphinx.ext.doctest',
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'enum_tools.autoenum',
    'sphinx.ext.autosummary',
    'm2r2'
]

source_suffix = [".rst", ".md"]

autosummary_generate = True
autosummary_imported_members = True

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'bizstyle'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
@@ -1,13 +0,0 @@
CONV2D Operation
================

.. autoclass:: pycutlass.Conv2dOperation
    :special-members:
    :members: run
    :exclude-members: __weakref__, configuration_name, core_name, extended_name, procedural_name

.. autoclass:: pycutlass.Conv2dArguments
    :special-members:
    :members:
    :exclude-members: initialize
    :show-inheritance:
@@ -1,100 +0,0 @@
cutlass
=======

.. rubric:: Operator Classification

.. autoclass:: cutlass.OpClass
    :members:

.. rubric:: GEMM Layout

.. autoclass:: cutlass.RowMajor
    :members:

.. autoclass:: cutlass.ColumnMajor
    :members:

.. autoclass:: cutlass.RowMajorInterleaved32
    :members:

.. autoclass:: cutlass.ColumnMajorInterleaved32
    :members:

.. rubric:: Conv Layout

.. autoclass:: cutlass.TensorNHWC
    :members:

.. autoclass:: cutlass.TensorNC32HW32
    :members:

.. autoclass:: cutlass.TensorC32RSK32
    :members:

.. rubric:: Threadblock Swizzle

.. autoclass:: cutlass.dim3
    :special-members:
    :members:

.. autoclass:: cutlass.IdentitySwizzle1
    :special-members:
    :members:

.. autoclass:: cutlass.IdentitySwizzle2
    :special-members:
    :members:

.. autoclass:: cutlass.IdentitySwizzle4
    :special-members:
    :members:

.. autoclass:: cutlass.IdentitySwizzle8
    :special-members:
    :members:

.. autoclass:: cutlass.HorizontalSwizzle
    :special-members:
    :members:

.. autoclass:: cutlass.BatchedIdentitySwizzle
    :special-members:
    :members:

.. autoclass:: cutlass.StridedDgradIdentitySwizzle1
    :special-members:
    :members:

.. autoclass:: cutlass.StridedDgradIdentitySwizzle4
    :special-members:
    :members:

.. autoclass:: cutlass.StridedDgradHorizontalSwizzle
    :special-members:
    :members:

.. rubric:: Coordinates

.. autoclass:: cutlass.Tensor4DCoord
    :special-members:
    :members:

.. autoclass:: cutlass.Tensor3DCoord
    :special-members:
    :members:

.. autoclass:: cutlass.MatrixCoord
    :special-members:
    :members:


.. rubric:: Convolution

.. autoclass:: cutlass.conv.Operator
    :members:

.. autoclass:: cutlass.conv.IteratorAlgorithm
    :members:

.. autoclass:: cutlass.conv.StrideSupport
    :members:
@@ -1,18 +0,0 @@
GEMM Operation
==============

.. autoclass:: pycutlass.GemmOperationUniversal
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmOperationGrouped
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmArguments
    :special-members:
    :members:

.. autoclass:: pycutlass.GemmGroupedArguments
    :special-members:
    :members:
@@ -1,31 +0,0 @@
.. PyCutlass documentation master file, created by
   sphinx-quickstart on Sun Jun 19 12:05:42 2022.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

CUTLASS Python Project Documentation
=====================================
.. mdinclude:: ../../README.md

.. toctree::
   :maxdepth: 2
   :caption: Contents:



.. Indices and tables
.. ==================

.. * :ref:`genindex`
.. * :ref:`modindex`
.. * :ref:`search`


Indices
==================
.. toctree::

   user_guide
   visitor_tree
   gemm_op
   conv2d_op
   cutlass
@@ -1,225 +0,0 @@
# Epilogue Visitor Tree
The Epilogue Visitor Tree is an experimental feature that directly generates epilogues from user-provided Python functions.

## Usage

The Epilogue Visitor Tree supports many different operations.
### Unary functions
Epilogue Visitor Tree supports unary functions like activation functions. For example,
```python
class UnaryEpilogue_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        T = leaky_relu.numpy(accum, 0.2)
        Z = alpha * T + beta * c
        return Z
epilogue_functor = UnaryEpilogue_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
```
### Broadcast Operation
Epilogue Visitor Tree supports broadcasting row and column vectors to the whole output matrix. To use broadcast, you just need to specify whether the source vector is a `row` vector or a `column` vector. Here is an example.
```python
class ColumnBroadcast_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            vector: 'column', alpha: 'scalar', beta: 'scalar'):
        #
        T = accum + vector
        scale_T = leaky_relu.numpy(alpha * T, 0.2)
        Z = scale_T + beta * c
        return Z, T
epilogue_functor = ColumnBroadcast_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
```
### Reduction Operation

The Epilogue Visitor Tree also supports row- and column-wise reduction in each threadblock tile. The syntax for reduction is
```python
{reduction_output} = reduction_op({input_tensor}, {row|column}, {Add}, {threadblock_shape.n|threadblock_shape.m})
```
The `{row|column}` indicates whether the `row` vectors or the `column` vectors are reduced. The `{Add}` specifies the reduction operation. The `{threadblock_shape.n|threadblock_shape.m}` is the reduction length.

**Constraint**
* The `{input_tensor}` can only be the name of a source or intermediate result. `reduction_op(A + B, ...)` will not work; please use `C = A + B`, `reduction_op(C, ...)` instead.
* The `{reduction_output}` cannot be used in the epilogue. It will be directly written to global memory after the reduction is done.
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction
epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```

## Get output_op

As shown in the user guide, an `output_op` is required by the argument wrapper. We will take the `RowReduction_` as an example to show how to get the `output_op`.
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: 'tensor', c: 'tensor',
            alpha: 'scalar', beta: 'scalar'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction
epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()

cta_n = args.threadblock_shape[1]
num_cta_n = (problem_size.n() + cta_n - 1) // cta_n
reduction = np.zeros(shape=(args.batch * problem_size.m() * num_cta_n,), dtype=getattr(np, element_c))
# get output op
output_op = operation.epilogue_type(
    D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C, reduction=reduction, problem_size=[problem_size.m(), problem_size.n()]
)
```
Like other epilogue functors such as `LinearCombination`, the output op for the EpilogueVisitorTree is also created with `operation.epilogue_type(*)`. However, there are two differences, contrasted in the sketch below:
* The arguments need to be passed as keyword arguments. The keywords are the argument names in `def __call__`.
* An additional `problem_size=[problem_size.m(), problem_size.n()]` is required.
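A minimal sketch contrasting the two calling conventions, reusing the names from the example above:

```python
# LinearCombination-style functors take positional scalar arguments:
output_op = operation.epilogue_type(alpha, beta)

# EpilogueVisitorTree functors take keyword arguments named after the
# parameters of __call__, plus the extra problem_size list:
output_op = operation.epilogue_type(
    D=tensor_D, alpha=args.alpha, beta=args.beta, c=tensor_C,
    reduction=reduction, problem_size=[problem_size.m(), problem_size.n()])
```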

## Add new Unary Operation (e.g. Activation Function)
To add an additional unary operation to the epilogue visitor tree, a new unary op
should be created for `VisitorOpUnary`. We will take `tanh` as an example.

### Step 1: define TanhVisitor

The visitor defines the parameters and computation required by the unary operation.
The unary operations are registered in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/unary_ops.h), but you can define yours in any header file and include that header in [pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h](tools/library/scripts/pycutlass/src/cpp/include/epilogue/epilogue_visitor_op/visitor_op_unary.h).

* Two template arguments are required:
    * `T`: data type used to compute the unary operation
    * `N`: compute fragment length
* We also need to provide the `Arguments` and `Params` structures. The `Arguments` will be assembled by [ctypes](https://docs.python.org/3/library/ctypes.html); the `Params` will be generated from `Arguments` automatically. If the unary function takes no argument, an integer like `int tmp` can be provided to ensure the correctness of ctypes.
* The constructor can only take the `params` as its single argument.
* The operation is defined in `Array<T, N> operator()(Array<T, N> const &frag) const`. One common way to do this is to first define a scalar computation, and then use it for the fragment computation with an unrolled for-loop.
* A guard function is required. If it returns `false`, it will disable all the child nodes of the unary node and return zeros to the parent node. This is very helpful for multiplication by a scalar when the scalar is `0`. For general cases, you can just return `true`.
```c++
// T: data type used to compute the unary operation
// N: compute fragment length
template <typename T, int N>
struct TanhVisitor {
  /// Argument
  struct Arguments {
    // a placeholder argument to ensure correctness of ctypes
    int tmp;

    CUTLASS_HOST_DEVICE
    Arguments(): tmp(0) { };

    CUTLASS_HOST_DEVICE
    Arguments(int tmp): tmp(tmp) { };
  };

  /// Param
  struct Params {
    CUTLASS_HOST_DEVICE
    Params(){ };
    Params(Arguments const &args) { }
  };

  /// Constructor
  CUTLASS_HOST_DEVICE
  TanhVisitor(Params const &params) { }

  // scalar operator
  CUTLASS_HOST_DEVICE
  T tanh_op(T const &scalar) const {
    return fast_tanh(scalar);
  }

  /// vector operator
  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &frag) const {
    Array<T, N> y;

    CUTLASS_PRAGMA_UNROLL
    for (int i=0; i < N; ++i) {
      y[i] = tanh_op(frag[i]);
    }

    return y;
  }

  // Guard
  CUTLASS_HOST_DEVICE
  bool guard() {
    return true;
  }
};
```

### Step 2: register Tanh function
After defining the function in C++, we need to register it in Python. The class below gives an example.
* The init function takes the data type `element_compute`, which will be the `T` in the C++ template.
  In the init function, we also generate the `_Arguments` class as a `ctypes.Structure`. It includes all the data members of `TanhVisitor::Arguments`.
* The `_Arguments` class needs to be registered as `self.argument_type` of the `tanh` class.
* An `emit` function is required to emit the namespace and typename of `TanhVisitor`.
* A `numpy` static method is required as the reference implementation that the parser evaluates.

The built-in functions are defined in [pycutlass/src/pycutlass/epilogue.py](tools/library/scripts/pycutlass/src/pycutlass/epilogue.py). You can define yours in any file, as long as it can be found by [/pycutlass/src/pycutlass/parser.py](tools/library/scripts/pycutlass/src/pycutlass/parser.py).
```python
class tanh(ActivationFunctor):
    def __init__(self, element_compute) -> None:
        super().__init__()

        class _Arguments(ctypes.Structure):
            _fields_ = [
                ("tmp", ctypes.c_int)
            ]
            def __init__(self, *args) -> None:
                self.tmp = 0

        self.argument_type = _Arguments

    def emit(self):
        return "cutlass::TanhVisitor"

    @staticmethod
    def numpy(x: np.ndarray):
        return np.tanh(x)
```

### Step 3: Run the function
Now the new unary op is ready to use. An epilogue visitor tree can be built with
```python
class RowReduction_(EpilogueVisitTree):
    def __call__(
            self, accum: NDArray['tensor', 'float32'], c: NDArray['tensor', 'float32'],
            alpha: 'float32', beta: 'float32'):
        #
        D = alpha * accum + tanh.numpy(beta * c)
        reduction = reduction_op(D, "row", "Add", args.threadblock_shape[1])
        return D, reduction

epilogue_functor = RowReduction_(
    epilogue_functor, tile_description, math_inst.element_accumulator,
    C.alignment, element_epilogue, C.element)
epilogue_functor.initialize()
```

## Limitations and Future work

Although the Epilogue Visitor Tree brings great flexibility to epilogue construction, formulating the epilogue as a single tree imposes several limitations.
* [Future Work] Serial and parallel split-K GEMM are not supported yet.
  * To support serial split-K, an additional tree transformation pass is required to inject a `binaryOpNode(Add)` + `TensorInputNode` before each `TensorOutputNode` to fetch the partial sums back. The `semaphore` also needs to be passed into the epilogue.
  * To support parallel split-K, a reduction kernel with visitor support is required.
* [Future Work] Convolution and GEMM Grouped are not supported yet.
  * To support Conv2d and GEMM Grouped, corresponding *_with_visitor kernels are required.
* [Limitation] If the same node is used by two operations (unless one of them is a reduction), the node and all of its descendants will be executed twice.
* [Limitation] The result of a reduction can only be used as a return value.

@ -1,283 +0,0 @@
# Basics of PyCUTLASS

PyCUTLASS handles the following tasks when launching CUTLASS kernels:
* Memory management
* Operation description
* Code emission and compilation
* Argument preprocessing
* Kernel launching
* Result synchronization

## Memory management

PyCUTLASS uses [RMM](https://github.com/rapidsai/rmm) to manage device memory. At the beginning of the program, call
```python
pycutlass.get_memory_pool({init_pool_size_in_bytes}, {max_pool_size_in_bytes})
```
We also provide a function to query the allocated size.
```python
bytes = get_allocated_size()
```
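For example — the pool sizes below are illustrative (the profiler scripts at the end of this document use `2**33` for both), and `get_allocated_size` is assumed to be importable from the package as in the snippet above:
```python
import pycutlass
from pycutlass import get_allocated_size

# Illustrative sizes: 1 GiB initial pool, allowed to grow to 8 GiB
pycutlass.get_memory_pool(2**30, 2**33)

# Query how many bytes are currently allocated from the pool
print(get_allocated_size())
```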

## Operation Description
PyCUTLASS provides operation descriptions for GEMM, GEMM Grouped, and Conv2d operations. These descriptions are assembled from four fundamental concepts:
* Math Instruction: the math instruction executed in the GPU cores
* Tile Description: tiling sizes and pipeline stages
* Operand Description: data type, layout, and memory alignment
* Epilogue Functor: the epilogue function

### Math Instruction

The math instruction is defined as follows:
```python
math_inst = MathInstruction(
    {instruction_shape}, {element_a}, {element_b},
    {element_acc}, {opclass}, {math_operation}
)
```
`{instruction_shape}` and `{opclass}` define the instruction size and type; the table below lists valid combinations. `{element_a}` and `{element_b}` define the source operand data types for each instruction, `{element_acc}` defines the accumulator type, and `{math_operation}` defines the math operation applied.

| Opclass | element_a/element_b | element_acc | instruction_shape | math_operation |
| -- | -- | -- | -- | -- |
| cutlass.OpClass.TensorOp | cutlass.float64 | cutlass.float64 | [8, 8, 4] | MathOperation.multiply_add |
| | cutlass.float32, cutlass.tfloat32, cutlass.float16, cutlass.bfloat16 | cutlass.float32 | [16, 8, 8] | MathOperation.multiply_add, MathOperation.multiply_add_fast_f32, MathOperation.multiply_add_fast_f16, MathOperation.multiply_add_fast_bf16 |
| | cutlass.float16 | cutlass.float16/cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.bfloat16 | cutlass.float32 | [16, 8, 16] | MathOperation.multiply_add |
| | cutlass.int8 | cutlass.int32 | [16, 8, 32] | MathOperation.multiply_add_saturate |
| cutlass.OpClass.Simt | cutlass.float64 | cutlass.float64 | [1, 1, 1] | MathOperation.multiply_add |
| | cutlass.float32 | cutlass.float32 | [1, 1, 1] | MathOperation.multiply_add |

`cutlass.OpClass.TensorOp` indicates that Tensor Cores are used, while `cutlass.OpClass.Simt` uses the SIMT cores.

`multiply_add_fast_f32` emulates a fast, accurate SGEMM kernel accelerated by Ampere Tensor Cores. More details can be found in [examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm](examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm).
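
For example, the `cutlass.float16` Tensor Core row of the table can be written out as follows, using the keyword names that also appear in the profiler scripts at the end of this document:
```python
# fp16 Tensor Core MMA with fp32 accumulation on the [16, 8, 16] instruction shape
math_inst = MathInstruction(
    instruction_shape=[16, 8, 16],
    element_a=cutlass.float16, element_b=cutlass.float16,
    element_accumulator=cutlass.float32,
    opcode_class=cutlass.OpClass.TensorOp,
    math_operation=MathOperation.multiply_add
)
```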

### Tile Description
The tile description describes the threadblock and warp tiling sizes, as well as the number of pipeline stages.
```python
tile_description = TileDescription(
    {threadblock_shape}, {stages}, {warp_count},
    math_inst
)
```
`{threadblock_shape}` is a list of 3 integers `[Tile_M, Tile_N, Tile_K]` that defines the threadblock tile size. `{stages}` defines the number of software pipeline stages ([detail](https://developer.nvidia.com/blog/controlling-data-movement-to-boost-performance-on-ampere-architecture/)). `{warp_count}` defines the number of warps along the `M`, `N`, and `K` dimensions: with `{threadblock_shape}=[Tile_M, Tile_N, Tile_K]` and `{warp_count}=[W_M, W_N, W_K]`, the warp tile size is `[Tile_M / W_M, Tile_N / W_N, Tile_K / W_K]`.
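
For example — the numbers here are taken from the conv2d profiler script at the end of this document — a `[128, 128, 64]` threadblock tile with `warp_count=[2, 2, 1]` yields a `[64, 64, 64]` warp tile:
```python
# 128x128x64 threadblock tile, 4 pipeline stages;
# each of the 2x2x1 warps computes a (128/2) x (128/2) x (64/1) = 64x64x64 tile
tile_description = TileDescription(
    threadblock_shape=[128, 128, 64], stages=4,
    warp_count=[2, 2, 1],
    math_instruction=math_inst
)
```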

### Operand Description
The operand description defines the data type, layout, and memory alignment of the input tensors A, B, and C. The output D shares the same attributes as C. The description is as follows:
```python
A = TensorDescription(
    {element_a}, {layout_a}, {alignment_a}
)

B = TensorDescription(
    {element_b}, {layout_b}, {alignment_b}
)

C = TensorDescription(
    {element_c}, {layout_c}, {alignment_c}
)
```
The table below lists the supported layouts and data types for each operation; a concrete example follows.
| Operation | data type | layout |
| -- | -- | -- |
| GEMM, GEMM Grouped | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.RowMajor, cutlass.ColumnMajor |
| | cutlass.int8 | cutlass.RowMajor, cutlass.ColumnMajor, cutlass.RowMajorInterleaved32, cutlass.ColumnMajorInterleaved32 |
| Conv2d Fprop, Dgrad, Wgrad | cutlass.float64, cutlass.float32, cutlass.float16, cutlass.bfloat16 | cutlass.TensorNHWC |
| Conv2d Fprop | cutlass.int8 | cutlass.TensorNHWC, cutlass.TensorNC32HW32, cutlass.TensorC32RSK32 |
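
For instance, half-precision NHWC operands with 8-element (128-bit) aligned accesses, as used by the conv2d profiler script at the end of this document:
```python
# fp16 activations and filters in NHWC layout
A = TensorDescription(
    element=cutlass.float16, layout=cutlass.TensorNHWC, alignment=8)
B = TensorDescription(
    element=cutlass.float16, layout=cutlass.TensorNHWC, alignment=8)
# fp32 output, also NHWC; D shares these attributes
C = TensorDescription(
    element=cutlass.float32, layout=cutlass.TensorNHWC, alignment=8)
```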

### Epilogue Functor
The epilogue functor defines the epilogue executed after the mainloop.
We expose the following epilogue functors.
| Epilogue Functor | Remark |
| -- | -- |
| LinearCombination | $D=\alpha \times Accum + \beta \times C$ |
| LinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; the output is clamped to the range of the output data type |
| FastLinearCombinationClamp | $D=\alpha \times Accum + \beta \times C$; only used for problem sizes with $K\le 256$ for cutlass.int8, with accumulator data type `cutlass.int32` and epilogue compute data type `cutlass.float32` |
| LinearCombinationGeneric | $D = activation(\alpha \times Accum + \beta \times C)$; available activations include `relu`, `leaky_relu`, `tanh`, `sigmoid`, `silu`, `hardswish`, and `gelu` |

The epilogue functors can be created as follows:
```python
# LinearCombination
epilogue_functor = LinearCombination(
    element_C, alignment_c, element_acc, element_epilogue_compute
)

# LinearCombinationClamp
epilogue_functor = LinearCombinationClamp(
    element_C, alignment_c, element_acc, element_epilogue_compute
)

# FastLinearCombinationClamp
epilogue_functor = FastLinearCombinationClamp(
    element_C, alignment_c
)

# LinearCombinationGeneric
epilogue_functor = LinearCombinationGeneric(
    relu(element_epilogue_compute), element_C, alignment_c,
    element_acc, element_epilogue_compute
)
```

We also provide an experimental feature, "Epilogue Visitor Tree", for the GEMM operation. The details can be found in [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md).

### GEMM Operation

The GEMM operation description can be created with
```python
operation = GemmOperationUniversal(
    {compute_capability}, tile_description,
    A, B, C, epilogue_functor,
    {swizzling_functor}, {visitor}
)
```
* `{compute_capability}` is an integer indicating the compute capability of the GPU. For A100, it is 80.
* `{swizzling_functor}` describes how threadblocks are scheduled on the GPU. This is used to improve L2 locality ([detail](https://developer.nvidia.com/blog/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling/)). Currently we support `cutlass.{IdentitySwizzle1|IdentitySwizzle2|IdentitySwizzle4|IdentitySwizzle8|BatchedIdentitySwizzle}`. The last one is used for batched and array GEMM.
* `{visitor}`: a boolean indicating whether the epilogue visitor tree is used. A concrete example follows this list.
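
For instance, mirroring the GEMM profiler script at the end of this document (which passes the parameters by keyword and additionally supplies `element_epilogue`):
```python
# fp16 Tensor Core GEMM targeting compute capability 80 (A100)
operation = GemmOperationUniversal(
    arch=80, tile_description=tile_description,
    A=A, B=B, C=C, element_epilogue=cutlass.float32,
    epilogue_functor=epilogue_functor,
    swizzling_functor=cutlass.IdentitySwizzle1
)
```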

### GEMM Grouped Operation
The GEMM Grouped operation description can be created with
```python
operation = GemmOperationGrouped(
    compute_capability, tile_description,
    A, B, C, epilogue_functor,
    swizzling_functor, {precompute_mode}
)
```
* `{precompute_mode}`: either `SchedulerMode.Host` or `SchedulerMode.Device`. See [examples/24_gemm_grouped](examples/24_gemm_grouped) for more details; a sketch follows.
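
A minimal sketch, assuming the positional argument order of the template above and reusing the descriptions defined earlier:
```python
# Grouped GEMM on SM80; the visit schedule is precomputed on the host
operation = GemmOperationGrouped(
    80, tile_description,
    A, B, C, epilogue_functor,
    cutlass.IdentitySwizzle1, SchedulerMode.Host
)
```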

### Conv2d Operation
The Conv2d operation description can be created with the template below; a concrete example follows the parameter list.
```python
operation = Conv2dOperation(
    {conv_kind}, {iterator_algorithm},
    compute_capability, tile_description,
    A, B, C, {stride_support},
    epilogue_functor, swizzling_functor
)
```
* `{conv_kind}` defines which convolution is executed. Available options include `fprop`, `dgrad`, and `wgrad`.
* `{iterator_algorithm}` specifies the iterator algorithm used by the implicit GEMM in the convolution. The options are as follows:
  * `analytic`: functionally correct in all cases, but lower performance
  * `optimized`: optimized for R <= 32, S <= 32, and unity-stride dgrad
  * `fixed_channels`: analytic algorithm optimized for a fixed channel count (C == AccessSize)
  * `few_channels`: analytic algorithm optimized for few channels (C divisible by AccessSize)
* `{stride_support}`: distinguishes among partial specializations that accelerate certain problems where the convolution stride is unity.
  * `strided`: arbitrary convolution stride
  * `unity`: unit convolution stride
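
For instance, the fprop operation from the conv2d profiler script at the end of this document:
```python
# Forward-propagation conv2d with the optimized iterator algorithm on SM80
operation = Conv2dOperation(
    conv_kind=cutlass.conv.Operator.fprop,
    iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
    arch=80, tile_description=tile_description, A=A, B=B, C=C,
    element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
    epilogue_functor=epilogue_functor,
    swizzling_functor=cutlass.IdentitySwizzle1
)
```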

***
## Code Emission and Compilation
After the operation description is created, the related host and device code can be compiled with
```python
import pycutlass

pycutlass.compiler.add_module([operation,])
```
Several operations can be compiled together. The `nvcc` at `$CUDA_INSTALL_PATH/bin` is used as the compiler backend by default, but you can also switch to [CUDA Python](https://nvidia.github.io/cuda-python/overview.html)'s `nvrtc` with
```python
pycutlass.compiler.nvrtc()
```
We also have an internal compiled-artifact manager that caches compiled kernels in both memory and on disk. The `compiled_cache.db` file in your workspace is the database that contains the binary files. You can delete this file if you want to recompile the kernels.
***
## Argument Processing
We provide argument wrappers to convert Python tensors into kernel parameters. Currently they support [torch.Tensor](https://pytorch.org/), [numpy.ndarray](https://numpy.org/), and [cupy.ndarray](https://cupy.dev/).
### GEMM Arguments
The GEMM arguments can be created with
```python
arguments = GemmArguments(
    operation=operation, problem_size={problem_size},
    A={tensor_A}, B={tensor_B}, C={tensor_C}, D={tensor_D},
    output_op={output_op},
    gemm_mode={gemm_mode},
    split_k_slices={split_k_slices}, batch={batch}
)
```
* `problem_size` is a `cutlass.gemm.GemmCoord(M, N, K)` object that defines an $M\times N\times K$ matrix multiplication.
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `gemm_mode`, `split_k_slices`, and `batch` interact as shown in the table below; a usage sketch follows the table.

| gemm_mode | split_k_slices | batch | remark |
|--|--|--|--|
| cutlass.gemm.Mode.Gemm | number of split-K slices | - | ordinary GEMM, or GEMM with serial split-K |
| cutlass.gemm.Mode.GemmSplitKParallel | number of split-K slices | - | GEMM with parallel split-K |
| cutlass.gemm.Mode.Batched | - | batch size | batched GEMM |
| cutlass.gemm.Mode.Array | - | batch size | array GEMM |
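
A minimal sketch for an ordinary GEMM with numpy tensors. The problem shape, the flattened host arrays, and the `LinearCombination`-style `output_op` (with `alpha=1.0`, `beta=0.0`) are illustrative assumptions; the layouts and element types must match the operation's operand descriptions:
```python
import numpy as np

problem_size = cutlass.gemm.GemmCoord(512, 256, 128)

# Illustrative flattened host tensors for M=512, N=256, K=128
tensor_A = np.random.uniform(-1, 1, (512 * 128,)).astype(np.float16)
tensor_B = np.random.uniform(-1, 1, (128 * 256,)).astype(np.float16)
tensor_C = np.zeros((512 * 256,)).astype(np.float32)
tensor_D = np.zeros((512 * 256,)).astype(np.float32)

arguments = GemmArguments(
    operation=operation, problem_size=problem_size,
    A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
    output_op=operation.epilogue_type(1.0, 0.0),  # alpha=1, beta=0
    gemm_mode=cutlass.gemm.Mode.Gemm,
    split_k_slices=1
)
```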

### GEMM Grouped Arguments
The GEMM Grouped arguments can be created with
```python
arguments = GemmGroupedArguments(
    operation, {problem_sizes_coord}, {tensor_As}, {tensor_Bs}, {tensor_Cs}, {tensor_Ds},
    output_op=output_op
)
```
* `problem_sizes_coord` is a list of `cutlass.gemm.GemmCoord(M, N, K)`, one per problem.
* `tensor_Xs` is a list of user-provided tensors.
* `output_op`: the parameters of the epilogue functor.

### Conv2d Arguments
The Conv2d arguments can be created with
```python
arguments = Conv2dArguments(
    operation, {problem_size}, {tensor_A},
    {tensor_B}, {tensor_C}, {tensor_D},
    {output_op},
    {split_k_mode},
    {split_k_slices}
)
```
* `problem_size` can be constructed with
```python
problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(N, H, W, C),
    cutlass.Tensor4DCoord(K, R, S, C),
    cutlass.Tensor4DCoord(pad[0], pad[1], pad[2], pad[3]),
    cutlass.MatrixCoord(stride[0], stride[1]),
    cutlass.MatrixCoord(dilation[0], dilation[1]),
    cutlass.conv.Mode.cross_correlation,
    split_k_slices, 1
)
```
* `tensor_X`: user-provided tensors.
* `output_op`: the parameters of the epilogue functor.
* `split_k_mode`: currently we support `cutlass.conv.SplitKMode.Serial` and `cutlass.conv.SplitKMode.Parallel`.
* `split_k_slices`: the number of split-K slices.

For an ordinary conv2d, just use `cutlass.conv.SplitKMode.Serial` with `split_k_slices=1`, as sketched below.
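
The problem shape here is taken from the conv2d profiler script at the end of this document; the tensors and the `output_op` (with `alpha=1.0`, `beta=0.0`) are illustrative assumptions:
```python
# 32x224x224x128 NHWC input, 128 3x3x128 filters, padding 1, stride 1, dilation 1
problem_size = cutlass.conv.Conv2dProblemSize(
    cutlass.Tensor4DCoord(32, 224, 224, 128),
    cutlass.Tensor4DCoord(128, 3, 3, 128),
    cutlass.Tensor4DCoord(1, 1, 1, 1),
    cutlass.MatrixCoord(1, 1),
    cutlass.MatrixCoord(1, 1),
    cutlass.conv.Mode.cross_correlation,
    1, 1
)

arguments = Conv2dArguments(
    operation, problem_size, tensor_A,
    tensor_B, tensor_C, tensor_D,
    operation.epilogue_type(1.0, 0.0),  # alpha=1, beta=0
    cutlass.conv.SplitKMode.Serial,
    1
)
```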

### Getting output_op
The `output_op` can be created as follows:
```python
output_op = operation.epilogue_type(*([alpha, beta] + args.activation_args))
```
It takes a list of arguments starting with the scaling factors `alpha` and `beta`.
The `output_op` of an EpilogueVisitorTree is slightly different. Please check [EpilogueVisitorTree](tools/library/scripts/pycutlass/docs/source/md/EpilogueVisitorTree.md) for details.

## Kernel Launching

With the arguments and the operation, the kernel can be launched simply with
```python
operation.run(arguments)
```

## Sync results

We also provide a function to synchronize with the kernel execution. If you use `numpy`, it will also copy the results back to the host. To do so, run
```python
arguments.sync()
```
If you use an EpilogueVisitorTree, please call
```python
output_op.sync()
```

## Reduction Kernel behind Parallel Split-K

If you use parallel split-K in GEMM or Conv2d, an additional reduction kernel is required. Please check [examples/40_cutlass_py](examples/40_cutlass_py) for details.
@ -1,4 +0,0 @@
User Guide
=====================================

.. mdinclude:: ./md/basic_idea.md
@ -1,4 +0,0 @@
User Guide
=====================================

.. mdinclude:: ./md/EpilogueVisitorTree.md
@ -1,106 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################

from pycutlass import *
import pycutlass
from pycutlass.epilogue import LinearCombination
from pycutlass.test.conv2d_testbed import Conv2dLauncher


if __name__ == "__main__":
    pycutlass.get_memory_pool(2**33, 2**33)
    pycutlass.compiler.nvcc()

    math_inst = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16, element_b=cutlass.float16,
        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )

    A = TensorDescription(
        element=math_inst.element_a,
        layout=cutlass.TensorNHWC,
        alignment=8)
    B = TensorDescription(
        element=math_inst.element_b,
        layout=cutlass.TensorNHWC,
        alignment=8)
    C = TensorDescription(
        element=cutlass.float32,
        layout=cutlass.TensorNHWC,
        alignment=8)

    tile_description = TileDescription(
        threadblock_shape=[128, 128, 64], stages=4,
        warp_count=[2, 2, 1],
        math_instruction=math_inst
    )

    epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)

    operation = Conv2dOperation(
        conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
        arch=80, tile_description=tile_description, A=A, B=B, C=C,
        element_epilogue=cutlass.float32, stride_support=StrideSupport.Strided,
        epilogue_functor=epilogue_functor,
        swizzling_functor=cutlass.IdentitySwizzle1
    )

    profiler = Conv2dLauncher(operation, verification=False, profiling=True)

    python_runtime = profiler.run(
        problem_size=cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(32, 224, 224, 128),
            cutlass.Tensor4DCoord(128, 3, 3, 128),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ), split_k_mode=cutlass.conv.SplitKMode.Serial
    )

    cpp_runtime = profiler.run_cutlass_profiler(
        problem_size=cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(32, 224, 224, 128),
            cutlass.Tensor4DCoord(128, 3, 3, 128),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ), split_k_mode=cutlass.conv.SplitKMode.Serial
    )

    print(cpp_runtime / python_runtime)
@ -1,91 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################

import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.test.gemm_testbed import GemmUniversalLauncher

if __name__ == '__main__':
    pycutlass.get_memory_pool(2**32, 2**32)
    pycutlass.compiler.nvcc()

    math_inst = MathInstruction(
        instruction_shape=[16, 8, 16],
        element_a=cutlass.float16, element_b=cutlass.float16,
        element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
        math_operation=MathOperation.multiply_add
    )

    tile_description = TileDescription(
        threadblock_shape=[256, 128, 32],
        stages=3, warp_count=[4, 2, 1],
        math_instruction=math_inst
    )

    A = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    B = TensorDescription(
        element=cutlass.float16, layout=cutlass.RowMajor,
        alignment=4
    )
    C = TensorDescription(
        element=cutlass.float32, layout=cutlass.ColumnMajor,
        alignment=4
    )

    element_epilogue = cutlass.float32

    epilogue_functor = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)

    swizzling_functor = cutlass.IdentitySwizzle1

    operation = GemmOperationUniversal(
        arch=80, tile_description=tile_description,
        A=A, B=B, C=C, element_epilogue=element_epilogue,
        epilogue_functor=epilogue_functor, swizzling_functor=swizzling_functor
    )

    profiler = GemmUniversalLauncher(operation, verification=False, profiling=True)
    python_runtime = profiler.run(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096)
    )

    cpp_runtime = profiler.run_cutlass_profiler(
        mode=cutlass.gemm.Mode.Gemm,
        problem_size=cutlass.gemm.GemmCoord(4096, 4096, 4096),
    )

    print(cpp_runtime / python_runtime)
@ -1,9 +0,0 @@
[build-system]

requires = [
    "setuptools",
    "scikit-build>0.13.1",
    "pybind11",
    "numpy<1.23",
    "cmake>=3.20.1,!=3.23.0"
]
@ -1,116 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################

import distutils.cmd
from setuptools import setup
import setuptools.command.build_py
import os

# build rmm dependency
class BuildRMM(distutils.cmd.Command):
    user_options = []
    def initialize_options(self):
        pass
    def finalize_options(self):
        pass
    def run(self):
        try:
            import rmm
        except ImportError:
            print("installing rmm")
            os.system("git clone -b branch-22.10 --recurse-submodules https://github.com/rapidsai/rmm.git")
            os.chdir("./rmm")
            os.system("./build.sh librmm rmm")
            os.chdir("./python")
            os.system("python setup.py build_ext --inplace")
            os.system("python setup.py install")

cutlass_path = os.getenv('CUTLASS_PATH')
assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."

ext_modules = []

try:
    from pybind11.setup_helpers import Pybind11Extension, build_ext
    include_dirs = [
        cutlass_path + "/include",
        cuda_install_path + "/include",
        cutlass_path + "/tools/util/include",
        cutlass_path + "/test",
        cutlass_path + "/tools/library/scripts/pycutlass/googletest/googletest/include"
    ]

    ext_modules = [
        Pybind11Extension("cutlass",
            ["src/cpp/cutlass.cpp"],
            include_dirs=include_dirs,
            extra_compile_args=["-fpermissive", "-w", "-std=c++17"]),
        Pybind11Extension("cute",
            ["src/cpp/cute.cpp"],
            include_dirs=include_dirs,
            extra_compile_args=["-fpermissive", "-w", "-std=c++17"])
    ]
except ImportError:
    pass

setup(
    name="PyCutlass",
    version="0.0.1",
    author="Zhaodong Chen; Andrew Kerr; Haicheng Wu; Szymon Migacz; Graham Markall",
    author_email="zhaodongc@nvidia.com",
    description="Python interface for CUTLASS",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    package_dir={"": "src"},
    packages=['pycutlass', 'pycutlass.utils', 'pycutlass.test'],
    setup_requires=["pybind11", "numpy<1.23"],
    install_requires=[
        "numpy<1.23",
        'pybind11',
        'cuda-python>=11.8.0',
        'typeguard',
        'bfloat16',
        'typing',
        'scikit-build',
        'treelib'
    ],
    cmdclass={
        'rmm': BuildRMM
    },
    ext_modules=ext_modules,
    python_requires=">=3.6",
)
@ -1,75 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/* \file
   \brief In-memory compiled artifact cache
*/

#include <pybind11/pybind11.h>
#include <string>
#include <unordered_map>


namespace py = pybind11;

namespace cutlass {

struct CompileCache {
public:
    CompileCache() = default;
    ~CompileCache() = default;

    using Cache = std::unordered_map<std::string, py::object>;

    /// Check if the kernel has already been compiled
    py::object at(const std::string &kernel) {
        auto item = cache_.find(kernel);

        if (item != cache_.end()) {
            return item->second;
        }
        return py::none();
    }

    /// Insert a new compiled kernel for a new configuration
    void insert(const std::string &kernel, const py::object &compiled_kernel){
        cache_.emplace(kernel, compiled_kernel);
    }

    const int64_t size() const { return cache_.size(); }

    /// Clear the cache
    void clear() { cache_.clear(); }

private:
    Cache cache_;
};

} // namespace cutlass
@ -1,54 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/* \file
   \brief binding CuTe C++ APIs to Python
*/

#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cute/arch/mma_sm90_gmma.hpp"

namespace py = pybind11;


PYBIND11_MODULE(cute, m) {

    // module doc
    m.doc() = "CuTe C++ bindings";

    py::enum_<cute::GMMA::Major>(m, "GMMAMajor",
        R"pbdoc(classification of CuTe GMMA tensor major specification)pbdoc")
        .value("K", cute::GMMA::Major::K,
            R"pbdoc(Tensor is contiguous in the reduction dimension)pbdoc")
        .value("MN", cute::GMMA::Major::MN,
            R"pbdoc(Tensor is contiguous in the non-reduction dimension)pbdoc");
}
@ -1,182 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/* \file
   \brief binding CUTLASS C++ APIs to Python
*/

#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "builtin_types.h"
#include "device_launch_parameters.h"
#include "stddef.h"
#include "cutlass/cutlass.h"

#include "include/conv/convolution.h"
#include "include/gemm/gemm.h"
#include "include/types.h"
#include "include/layout/layout.h"
#include "include/tensor_coord.h"
#include "include/arch.h"
#include "include/tensor_ref_view.h"
#include "include/swizzling.h"
#include "test/conv/convolution.h"
#include "test/gemm/gemm.h"


// Data Types
#include "library.h"

// compiler
#include "compiler.h"


namespace py = pybind11;


PYBIND11_MODULE(cutlass, m) {

    // module doc
    m.doc() = "cutlass C++ binding";

    //
    // Bind data type
    //
    bind_cutlass_types(m);

    //
    // Bind layout
    //
    bind_layout(m);

    //
    // Bind tensor coord
    //
    bind_tensor_coord(m);

    //
    // Bind tensor ref
    //
    bind_tensor_refs_and_views(m);

    //
    // Bind opcode
    //
    bind_opcode(m);

    //
    // Bind convolution
    //
    py::module_ conv_submodule = m.def_submodule("conv");
    bind_convolution(conv_submodule);

    //
    // Bind gemm
    //
    py::module_ gemm_submodule = m.def_submodule("gemm");
    bind_gemm(gemm_submodule);

    //
    // Bind swizzling
    //
    bind_threadblock_swizzle(m);

    //
    // Bind test units
    //
    py::module_ test = m.def_submodule("test");
    py::module_ test_conv = test.def_submodule("conv");
    bind_convolution_test(test_conv);

    py::module_ test_gemm = test.def_submodule("gemm");
    bind_gemm_test(test_gemm);

    // data types
    py::enum_<cutlass::DataType>(m, "dtype")
        .value("b1", cutlass::DataType::kB1)
        .value("u2", cutlass::DataType::kU2)
        .value("u4", cutlass::DataType::kU4)
        .value("u8", cutlass::DataType::kU8)
        .value("u16", cutlass::DataType::kU16)
        .value("u32", cutlass::DataType::kU32)
        .value("u64", cutlass::DataType::kU64)
        .value("s2", cutlass::DataType::kS2)
        .value("s4", cutlass::DataType::kS4)
        .value("s16", cutlass::DataType::kS16)
        .value("s64", cutlass::DataType::kS64)
        .value("cf16", cutlass::DataType::kCF16)
        .value("cbf16", cutlass::DataType::kCBF16)
        .value("cf32", cutlass::DataType::kCF32)
        .value("ctf32", cutlass::DataType::kCTF32)
        .value("cf64", cutlass::DataType::kCF64)
        .value("cs2", cutlass::DataType::kCS2)
        .value("cs4", cutlass::DataType::kCS4)
        .value("cs8", cutlass::DataType::kCS8)
        .value("cs16", cutlass::DataType::kCS16)
        .value("cs32", cutlass::DataType::kCS32)
        .value("cs64", cutlass::DataType::kCS64)
        .value("cu2", cutlass::DataType::kCU2)
        .value("cu4", cutlass::DataType::kCU4)
        .value("cu8", cutlass::DataType::kCU8)
        .value("cu16", cutlass::DataType::kCU16)
        .value("cu32", cutlass::DataType::kCU32)
        .value("cu64", cutlass::DataType::kCU64)
        .value("invalid", cutlass::DataType::kInvalid);

    // layout types
    py::enum_<cutlass::LayoutType>(m, "layout")
        .value("ColumnMajorInterleaved2", cutlass::LayoutType::kColumnMajorInterleaved2)
        .value("RowMajorInterleaved2", cutlass::LayoutType::kRowMajorInterleaved2)
        .value("ColumnMajorInterleaved64", cutlass::LayoutType::kColumnMajorInterleaved64)
        .value("RowMajorInterleaved64", cutlass::LayoutType::kRowMajorInterleaved64)
        .value("TensorNDHWC", cutlass::LayoutType::kTensorNDHWC)
        .value("TensorNCHW", cutlass::LayoutType::kTensorNCHW)
        .value("TensorNGHWC", cutlass::LayoutType::kTensorNGHWC)
        .value("TensorNC64HW64", cutlass::LayoutType::kTensorNC64HW64)
        .value("TensorC64RSK64", cutlass::LayoutType::kTensorC64RSK64);

    // transform types
    py::enum_<cutlass::ComplexTransform>(m, "complex_transform")
        .value("none", cutlass::ComplexTransform::kNone)
        .value("conj", cutlass::ComplexTransform::kConjugate);

    //
    // Compiler
    //
    py::class_<cutlass::CompileCache>(m, "CompileCache")
        .def(py::init<>())
        .def("at", &cutlass::CompileCache::at)
        .def("insert", &cutlass::CompileCache::insert)
        .def("size", &cutlass::CompileCache::size)
        .def("clear", &cutlass::CompileCache::clear);

}
@ -1,59 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/* \file
   \brief Bind opcode classes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/arch/mma.h"

namespace py = pybind11;

namespace cutlass {
enum class OpcodeClass {
    kSimt, kTensorOp, kWmmaTensorOp, kSparseTensorOp
};
}

void bind_opcode(py::module &m) {
    py::enum_<cutlass::OpcodeClass>(m, "OpClass",
        R"pbdoc(classification of math operators)pbdoc")
        .value("Simt", cutlass::OpcodeClass::kSimt,
            R"pbdoc(Tag classifying math operators as thread-level operations)pbdoc")
        .value("TensorOp", cutlass::OpcodeClass::kTensorOp,
            R"pbdoc(Tag classifying operators as Tensor Core operations)pbdoc")
        .value("WmmaTensorOp", cutlass::OpcodeClass::kWmmaTensorOp,
            R"pbdoc(Tag classifying operators as WMMA Tensor Core operations)pbdoc")
        .value("SparseTensorOp", cutlass::OpcodeClass::kSparseTensorOp,
            R"pbdoc(Tag classifying operators as sparse Tensor Core operations)pbdoc");
}
@ -1,102 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/* \file
   \brief Bind Convolution problem sizes to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/conv/conv2d_problem_size.h"

namespace py = pybind11;

void bind_conv_problem_size(py::module &m) {
    //
    // Conv2d Problem Size:
    // include/cutlass/conv/conv2d_problem_size.h
    //
    py::class_<cutlass::conv::Conv2dProblemSize>(m, "Conv2dProblemSize")
        // constructors
        .def(py::init<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, cutlass::conv::Mode, int, int>())
        .def(py::init<cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::Tensor4DCoord, cutlass::MatrixCoord, cutlass::MatrixCoord, cutlass::conv::Mode, int, int>())
        // attribute accessors
        .def_readwrite("N", &cutlass::conv::Conv2dProblemSize::N)
        .def_readwrite("H", &cutlass::conv::Conv2dProblemSize::H)
        .def_readwrite("W", &cutlass::conv::Conv2dProblemSize::W)
        .def_readwrite("C", &cutlass::conv::Conv2dProblemSize::C)
        .def_readwrite("P", &cutlass::conv::Conv2dProblemSize::P)
        .def_readwrite("Q", &cutlass::conv::Conv2dProblemSize::Q)
        .def_readwrite("K", &cutlass::conv::Conv2dProblemSize::K)
        .def_readwrite("R", &cutlass::conv::Conv2dProblemSize::R)
        .def_readwrite("S", &cutlass::conv::Conv2dProblemSize::S)
        .def_readwrite("pad_h", &cutlass::conv::Conv2dProblemSize::pad_h)
        .def_readwrite("pad_w", &cutlass::conv::Conv2dProblemSize::pad_w)
        .def_readwrite("stride_h", &cutlass::conv::Conv2dProblemSize::stride_h)
        .def_readwrite("stride_w", &cutlass::conv::Conv2dProblemSize::stride_w)
        .def_readwrite("dilation_h", &cutlass::conv::Conv2dProblemSize::dilation_h)
        .def_readwrite("dilation_w", &cutlass::conv::Conv2dProblemSize::dilation_w)
        .def_readwrite("mode", &cutlass::conv::Conv2dProblemSize::mode)
        .def_readwrite("split_k_slices", &cutlass::conv::Conv2dProblemSize::split_k_slices)
        .def_readwrite("groups", &cutlass::conv::Conv2dProblemSize::groups)
        // functions
        .def("reset_split_k_slices", &cutlass::conv::Conv2dProblemSize::reset_split_k_slices)
        .def("activation_extent", &cutlass::conv::Conv2dProblemSize::activation_extent)
        .def("filter_extent", &cutlass::conv::Conv2dProblemSize::filter_extent)
        .def("output_extent", &cutlass::conv::Conv2dProblemSize::output_extent)
        .def("activation_size", &cutlass::conv::Conv2dProblemSize::activation_size)
        .def("filter_size", &cutlass::conv::Conv2dProblemSize::filter_size)
        .def("output_size", &cutlass::conv::Conv2dProblemSize::output_size);

    // Get tensor size
    m.def("implicit_gemm_tensor_a_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_a_size));
    m.def("implicit_gemm_tensor_b_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_b_size));
    m.def("implicit_gemm_tensor_c_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&>(&cutlass::conv::implicit_gemm_tensor_c_size));

    // Get tensor extent
    m.def("implicit_gemm_tensor_a_extent",
        py::overload_cast<
            cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
        >(&cutlass::conv::implicit_gemm_tensor_a_extent));

    m.def("implicit_gemm_tensor_b_extent",
        py::overload_cast<
            cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
        >(&cutlass::conv::implicit_gemm_tensor_b_extent));

    m.def("implicit_gemm_tensor_c_extent",
        py::overload_cast<
            cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&
        >(&cutlass::conv::implicit_gemm_tensor_c_extent));

    m.def("implicit_gemm_problem_size", py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize &>(&cutlass::conv::implicit_gemm_problem_size));

}
@ -1,91 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file
  \brief Binds convolution-related enum types to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "conv_problem_size.h"
#include "host.h"
#include "cutlass/conv/convolution.h"

namespace py = pybind11;

void bind_convolution(py::module &m) {
  //
  // Enumerated types
  // cutlass/include/cutlass/conv/convolution.h
  //

  /// Convolutional operator
  py::enum_<cutlass::conv::Operator>(m, "Operator", R"pbdoc(Convolutional operator)pbdoc")
    .value("fprop", cutlass::conv::Operator::kFprop, "Forward propagation")
    .value("dgrad", cutlass::conv::Operator::kDgrad, "Activation gradient")
    .value("wgrad", cutlass::conv::Operator::kWgrad, "Weight gradient");

  /// Distinguishes convolution from cross correlation
  py::enum_<cutlass::conv::Mode>(m, "Mode")
    .value("cross_correlation", cutlass::conv::Mode::kCrossCorrelation)
    .value("convolution", cutlass::conv::Mode::kConvolution);

  /// Selects among several implementation variants trading off performance with simplicity
  py::enum_<cutlass::conv::IteratorAlgorithm>(m, "IteratorAlgorithm",
    R"pbdoc(Selects among several implementation variants trading off performance with simplicity)pbdoc")
    .value("analytic", cutlass::conv::IteratorAlgorithm::kAnalytic, R"pbdoc(functionally correct in all cases but lower performance)pbdoc")
    .value("optimized", cutlass::conv::IteratorAlgorithm::kOptimized, R"pbdoc(optimized for R <= 32, S <= 32, and unity-stride dgrad)pbdoc")
    .value("fixed_channels", cutlass::conv::IteratorAlgorithm::kFixedChannels, R"pbdoc(analytic algorithm optimized for a fixed channel count (C == AccessSize))pbdoc")
    .value("few_channels", cutlass::conv::IteratorAlgorithm::kFewChannels, R"pbdoc(analytic algorithm optimized for few channels (C divisible by AccessSize))pbdoc");

  /// Distinguishes among partial specializations that accelerate certain problems where convolution
  /// stride is unit.
  py::enum_<cutlass::conv::StrideSupport>(m, "StrideSupport",
    R"pbdoc(Distinguishes among partial specializations that accelerate certain problems where convolution
stride is unit.)pbdoc")
    .value("strided", cutlass::conv::StrideSupport::kStrided, R"pbdoc(arbitrary convolution stride)pbdoc")
    .value("unity", cutlass::conv::StrideSupport::kUnity, R"pbdoc(unit convolution stride)pbdoc");

  /// Identifies split-K mode
  py::enum_<cutlass::conv::SplitKMode>(m, "SplitKMode")
    .value("None", cutlass::conv::SplitKMode::kNone)
    .value("Serial", cutlass::conv::SplitKMode::kSerial)
    .value("Parallel", cutlass::conv::SplitKMode::kParallel);

  // Conv problem sizes
  bind_conv_problem_size(m);

  //
  // Host helper functions
  //
  py::module_ host_submodule = m.def_submodule("host");
  bind_conv_host_helper(host_submodule);
}
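// Example (editorial sketch): a translation unit exposing these bindings as a
// Python extension module would invoke bind_convolution from the standard
// pybind11 module macro. The module name "cutlass_bindings" is hypothetical.
//
// PYBIND11_MODULE(cutlass_bindings, m) {
//   bind_convolution(m);
// }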
@ -1,54 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file
  \brief Binds conv host helpers to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"

namespace py = pybind11;

void bind_conv_host_helper(py::module &m) {
  /// Reorder operand B for interleaved layout
  m.def("reorder_convK", [](
    cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> dest,
    cutlass::TensorRef<int8_t, cutlass::layout::TensorCxRSKx<32>> src,
    cutlass::conv::Operator conv_op, const cutlass::conv::Conv2dProblemSize & problem_size) {
      cutlass::gemm::GemmCoord implicit_problem_size = cutlass::conv::implicit_gemm_problem_size(conv_op, problem_size);
      cutlass::reorder_convK<32>(dest, src, implicit_problem_size);
  });
}
@ -1,222 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief A generic wrapper around an epilogue visitor operation
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"

#include "epilogue_visitor_op/visitor_op_linear_combination.h"
#include "epilogue_visitor_op/visitor_op_tensor_input.h"
#include "epilogue_visitor_op/visitor_op_accumulator.h"
#include "epilogue_visitor_op/visitor_op_row_broadcast.h"
#include "epilogue_visitor_op/visitor_op_tensor_output.h"
#include "epilogue_visitor_op/visitor_op_column_reduction.h"
#include "epilogue_visitor_op/visitor_op_row_reduction.h"
#include "epilogue_visitor_op/visitor_op_column_broadcast.h"
#include "epilogue_visitor_op/visitor_op_unary.h"
#include "epilogue_visitor_op/visitor_op_binary.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Generic Epilogue Visitor
template <
  typename OutputOp_
>
class EpilogueVisitorGeneric {
public:

  using OutputOp = OutputOp_;
  using AccumulatorAccessType = typename OutputOp::AccumulatorAccessType;
  static int const kElementsPerAccess = OutputOp::kElementsPerAccess;
  using ElementOutput = typename OutputOp::ElementOutput;
  using OutputTileIterator = typename OutputOp::OutputTileIterator;

  static int const kIterations = OutputTileIterator::kIterations;

  ///
  /// End Epilogue Tree
  ///

  /// No additional SMEM buffer is required by this epilogue visitor
  struct SharedStorage {
    typename OutputOp::SharedStorage output_smem;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

public:

  /// Argument structure
  struct Arguments {
    typename OutputOp::Arguments output_op_args;

    //
    // Methods
    //
    Arguments() { }

    Arguments(
      typename OutputOp::Arguments output_op_args
    ):
      output_op_args(output_op_args)
    { }
  };

  struct Params {
    typename OutputOp::Params output_op_params;

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      output_op_params(args.output_op_args)
    { }
  };

private:

  OutputOp output_op;

public:

  /// Constructor
  CUTLASS_DEVICE
  EpilogueVisitorGeneric(
    Params const &params,                     ///< Parameters routed to the epilogue
    SharedStorage &shared_storage,            ///< Shared storage needed by the functors here
    MatrixCoord threadblock_offset,
    gemm::GemmCoord threadblock_tile_offset,
    int thread_idx,
    MatrixCoord problem_size
  ):
    output_op(params.output_op_params, shared_storage.output_smem, thread_idx, threadblock_offset, problem_size)
  { }

  /// Helper to indicate split-K behavior
  CUTLASS_DEVICE
  void set_k_partition(
    int split_k_index,                        ///< Index of this threadblock within split-K partitioned scheme
    int split_k_slices) {                     ///< Total number of split-K slices
  }

  /// Called to set the batch index
  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    output_op.set_batch_index(batch_idx);
  }

  /// Called at the start of the epilogue just before iterating over accumulator slices
  CUTLASS_DEVICE
  void begin_epilogue() {
    output_op.begin_epilogue();
  }

  /// Called at the start of one step before starting accumulator exchange
  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    output_op.begin_step(step_idx);
  }

  /// Called at the start of a row
  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    output_op.begin_row(row_idx);
  }

  /// Called after accumulators have been exchanged for each accumulator vector
  CUTLASS_DEVICE
  void visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum) {
    output_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
  }

  /// Called at the end of a row
  CUTLASS_DEVICE
  void end_row(int row_idx) {
    output_op.end_row(row_idx);
  }

  /// Called after all accumulator elements have been visited
  CUTLASS_DEVICE
  void end_step(int step_idx) {
    output_op.end_step(step_idx);
  }

  /// Called after all steps have been completed
  CUTLASS_DEVICE
  void end_epilogue() {
    output_op.end_epilogue();
  }

};

////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

////////////////////////////////////////////////////////////////////////////////
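// Example (editorial sketch of the visitor protocol): an epilogue that
// consumes a visitor of this shape invokes its callbacks in the order below.
// The loop bounds and the helper name drive_epilogue are placeholders; the
// real traversal lives in cutlass/epilogue/threadblock/epilogue_with_visitor.h.
template <typename Visitor, typename Fragment>
CUTLASS_DEVICE
void drive_epilogue(Visitor &visitor, Fragment const &accum_frag,
                    int num_steps, int rows_per_step, int frags_per_row) {
  visitor.begin_epilogue();
  for (int step = 0; step < num_steps; ++step) {
    visitor.begin_step(step);
    int frag_idx = 0;
    for (int row = 0; row < rows_per_step; ++row) {
      visitor.begin_row(row);
      for (int col = 0; col < frags_per_row; ++col, ++frag_idx) {
        // One accumulator vector is visited per (row, column) position
        visitor.visit(step, row, col, frag_idx, accum_frag);
      }
      visitor.end_row(row);
    }
    visitor.end_step(step);
  }
  visitor.end_epilogue();
}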
@ -1,84 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief Defines the binary ops used by the epilogue visitors
*/

#pragma once
#include "cutlass/cutlass.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Element-wise vector addition
template <typename T, int N>
struct VectorAdd {

  struct Arguments {
    // a placeholder argument to ensure correctness of ctypes
    int tmp;

    CUTLASS_HOST_DEVICE
    Arguments(): tmp(0) { }

    CUTLASS_HOST_DEVICE
    Arguments(int tmp): tmp(tmp) { }
  };

  struct Params {

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args) { }
  };

  CUTLASS_HOST_DEVICE
  VectorAdd(
    Params const &params
  ) { }

  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &lhs, Array<T, N> const &rhs) const {
    cutlass::plus<Array<T, N>> add_op;
    return add_op(lhs, rhs);
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
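// Example (editorial sketch): VectorAdd is a stateless functor over register
// fragments; the Arguments -> Params -> functor chain mirrors the other ops.
inline cutlass::Array<float, 4> vector_add_example() {
  using Op = cutlass::VectorAdd<float, 4>;
  Op::Params params(Op::Arguments(0));  // the int is only a ctypes placeholder
  Op add(params);

  cutlass::Array<float, 4> a;
  cutlass::Array<float, 4> b;
  a.fill(1.0f);
  b.fill(2.0f);
  return add(a, b);  // element-wise sum: every entry equals 3.0f
}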
@ -1,233 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief Defines the unary ops used by the epilogue visitors
*/

#pragma once
#include "cutlass/cutlass.h"
#include "cutlass/epilogue/thread/activation.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Scalar multiplication
template <typename T, int N>
struct Mult {

  struct Arguments {
    T alpha;

    CUTLASS_HOST_DEVICE
    Arguments(): alpha(T(1.0)) { }

    CUTLASS_HOST_DEVICE
    Arguments(T alpha): alpha(alpha) { }
  };

  struct Params {
    T alpha;              ///< scales accumulators

    CUTLASS_HOST_DEVICE
    Params(): alpha(T(1.0)) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args): alpha(args.alpha) { }
  };

  T alpha_;

  CUTLASS_HOST_DEVICE
  Mult(
    Params const &params
  ):
    alpha_(params.alpha)
  { }

  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &source) const {
    cutlass::multiplies<Array<T, N>> multiply_op;
    return multiply_op(source, alpha_);
  }

  CUTLASS_HOST_DEVICE
  bool guard() {
    return alpha_ != T(0);
  }

};


/// ReLU
template <typename T, int N>
struct ReLUVisitor {
  struct Arguments {
    T threshold;

    CUTLASS_HOST_DEVICE
    Arguments(): threshold(T(0.0)) { }

    CUTLASS_HOST_DEVICE
    Arguments(T threshold): threshold(threshold) { }
  };

  struct Params {
    T threshold;

    CUTLASS_HOST_DEVICE
    Params(): threshold(T(0.0)) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args): threshold(args.threshold) { }
  };

  T threshold_;

  CUTLASS_HOST_DEVICE
  ReLUVisitor(Params const &params):
    threshold_(params.threshold) { }

  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &frag) const {
    maximum<Array<T, N>> mx;
    return mx(frag, threshold_);
  }

  CUTLASS_HOST_DEVICE
  bool guard() {
    return true;
  }
};

/// Leaky ReLU
template <typename T, int N>
struct LeakyReLUVisitor {
  struct Arguments {
    T leaky_alpha;

    CUTLASS_HOST_DEVICE
    Arguments(): leaky_alpha(T(0.0)) { }

    CUTLASS_HOST_DEVICE
    Arguments(T leaky_alpha): leaky_alpha(leaky_alpha) { }
  };

  struct Params {
    T leaky_alpha;

    CUTLASS_HOST_DEVICE
    Params(): leaky_alpha(T(0.0)) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args): leaky_alpha(args.leaky_alpha) { }
  };

  T leaky_alpha_;

  CUTLASS_HOST_DEVICE
  LeakyReLUVisitor(Params const &params):
    leaky_alpha_(params.leaky_alpha) { }

  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &frag) const {
    cutlass::epilogue::thread::LeakyReLU<Array<T, N>> leaky_op;
    return leaky_op(frag, leaky_alpha_);
  }

  CUTLASS_HOST_DEVICE
  bool guard() {
    return true;
  }

};

/// Tanh
template <typename T, int N>
struct TanhVisitor {
  /// Arguments
  struct Arguments {
    // a placeholder argument to ensure correctness of ctypes
    int tmp;

    CUTLASS_HOST_DEVICE
    Arguments(): tmp(0) { }

    CUTLASS_HOST_DEVICE
    Arguments(int tmp): tmp(tmp) { }
  };

  /// Params
  struct Params {
    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args) { }
  };

  /// Constructor
  CUTLASS_HOST_DEVICE
  TanhVisitor(Params const &params) { }

  /// Scalar operator
  CUTLASS_HOST_DEVICE
  T tanh_op(T const &scalar) const {
    return fast_tanh(scalar);
  }

  /// Vector operator
  CUTLASS_HOST_DEVICE
  Array<T, N> operator()(Array<T, N> const &frag) const {
    Array<T, N> y;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < N; ++i) {
      y[i] = tanh_op(frag[i]);
    }

    return y;
  }

  CUTLASS_HOST_DEVICE
  bool guard() {
    return true;
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
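// Example (editorial sketch): the unary visitors share the Arguments ->
// Params -> functor pattern, and guard() lets a caller skip work when the op
// cannot change the result (e.g. alpha == 0 in Mult).
inline cutlass::Array<float, 4> scale_fragment_example(cutlass::Array<float, 4> const &frag) {
  using Scale = cutlass::Mult<float, 4>;
  Scale::Params params(Scale::Arguments(0.5f));
  Scale scale(params);

  if (!scale.guard()) {
    // alpha == 0: the product is identically zero and the multiply may be skipped
  }
  return scale(frag);  // frag * 0.5f element-wise
}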
@ -1,148 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief Epilogue visitor op that returns the accumulator fragment unchanged
*/

#pragma once
#include "cutlass/cutlass.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///  ElementAccumulator accum;
///  return accum;
///
/// It can only be a leaf node of the epilogue tree

template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  int kElementsPerAccess_        ///< Number of elements computed per operation
>
class VisitorOpAccumulator {
public:
  using ElementAccumulator = ElementAccumulator_;
  static int const kElementsPerAccess = kElementsPerAccess_;

  /// Fragment type for the accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Fragment type returned by this visitor
  using VisitAccessType = AccumulatorAccessType;

  /// SMEM buffer class required in the epilogue visitor
  struct SharedStorage {
    CUTLASS_HOST_DEVICE
    SharedStorage() {}
  };

  /// Host-constructable Arguments structure
  struct Arguments {
    // Note: ctypes misbehaves with an empty argument struct, so a placeholder
    // member is required
    int tmp;

    CUTLASS_HOST_DEVICE
    Arguments(): tmp(0) { }

    CUTLASS_HOST_DEVICE
    Arguments(int tmp): tmp(tmp) { }
  };

  /// Parameter structure
  struct Params {

    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args) { }
  };

public:
  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpAccumulator(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ) { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) { }

  CUTLASS_DEVICE
  void begin_epilogue() { }

  CUTLASS_DEVICE
  void begin_step(int step_idx) { }

  CUTLASS_DEVICE
  void begin_row(int row_idx) { }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    return accum;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) { }

  CUTLASS_DEVICE
  void end_step(int step_idx) { }

  CUTLASS_DEVICE
  void end_epilogue() { }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,245 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief Epilogue visitor op that combines two child visitors with a binary op
*/

#pragma once
#include "cutlass/cutlass.h"
#include "binary_ops.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///  ElementCompute C = BinaryOp(ElementCompute(Visitor_A), ElementCompute(Visitor_B));
///  return C;
///
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename ElementCompute_,      ///< Data type used to compute the binary op
  int      kElementsPerAccess_,  ///< Number of elements computed per operation
  typename VisitorA_,            ///< Child node A
  typename VisitorB_,            ///< Child node B
  template<typename T, int N> typename BinaryOp_
>
class VisitorOpBinary {
public:
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  static int const kElementsPerAccess = kElementsPerAccess_;

  using VisitorA = VisitorA_;
  using VisitorB = VisitorB_;

  /// Fragment type returned from VisitorA.visit
  using VisitAccessTypeA = typename VisitorA::VisitAccessType;
  using ElementA = typename VisitAccessTypeA::Element;

  /// Fragment type returned from VisitorB.visit
  using VisitAccessTypeB = typename VisitorB::VisitAccessType;
  using ElementB = typename VisitAccessTypeB::Element;

  /// Fragment type returned by this visitor
  using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  using BinaryOp = BinaryOp_<ElementCompute, kElementsPerAccess>;

  static_assert(kElementsPerAccess == VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
  static_assert(kElementsPerAccess == VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");

  /// SMEM buffer class required in the epilogue visitor
  struct SharedStorage {
    typename VisitorA::SharedStorage storage_a;
    typename VisitorB::SharedStorage storage_b;

    CUTLASS_HOST_DEVICE
    SharedStorage() {}
  };

  /// Host-constructable Arguments structure
  struct Arguments {
    typename BinaryOp::Arguments binary_arg;
    typename VisitorA::Arguments visitor_a_arg;  ///< Argument type for visitor_a
    typename VisitorB::Arguments visitor_b_arg;  ///< Argument type for visitor_b

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Arguments(): binary_arg() { }

    CUTLASS_HOST_DEVICE
    Arguments(
      typename BinaryOp::Arguments binary_arg,
      typename VisitorA::Arguments visitor_a_arg,
      typename VisitorB::Arguments visitor_b_arg
    ):
      binary_arg(binary_arg),
      visitor_a_arg(visitor_a_arg),
      visitor_b_arg(visitor_b_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    typename BinaryOp::Params binary_param;
    typename VisitorA::Params visitor_a_param;   ///< Parameter type for visitor_a
    typename VisitorB::Params visitor_b_param;   ///< Parameter type for visitor_b

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      binary_param(args.binary_arg),
      visitor_a_param(args.visitor_a_arg),
      visitor_b_param(args.visitor_b_arg)
    { }
  };

private:
  //
  // Data members
  //

  BinaryOp binary_op;

  VisitorA visitor_a_op;
  VisitorB visitor_b_op;

public:

  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpBinary(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    binary_op(params.binary_param),
    visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
    visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
  { }

  CUTLASS_DEVICE
  void begin_epilogue() {
    visitor_a_op.begin_epilogue();
    visitor_b_op.begin_epilogue();
  }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    visitor_a_op.set_batch_index(batch_idx);
    visitor_b_op.set_batch_index(batch_idx);
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    visitor_a_op.begin_step(step_idx);
    visitor_b_op.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    visitor_a_op.begin_row(row_idx);
    visitor_b_op.begin_row(row_idx);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get results from visitor A and visitor B
    VisitAccessTypeA result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
    VisitAccessTypeB result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);

    /// Type conversion
    NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
    NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;

    return binary_op(
      source_converter_A(result_A),
      source_converter_B(result_B)
    );
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) {
    visitor_a_op.end_row(row_idx);
    visitor_b_op.end_row(row_idx);
  }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    visitor_a_op.end_step(step_idx);
    visitor_b_op.end_step(step_idx);
  }

  CUTLASS_DEVICE
  void end_epilogue() {
    visitor_a_op.end_epilogue();
    visitor_b_op.end_epilogue();
  }
};


/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
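// Example (editorial sketch): visitors compose into an epilogue tree by
// nesting template parameters. A tree computing D = accum + accum from the
// pieces defined in this diff would read:
using AccumLeaf = cutlass::epilogue::threadblock::VisitorOpAccumulator<float, 8>;

using DoubleAccumTree = cutlass::epilogue::threadblock::VisitorOpBinary<
    float,               // ElementAccumulator
    float,               // ElementCompute
    8,                   // kElementsPerAccess
    AccumLeaf,           // child node A
    AccumLeaf,           // child node B
    cutlass::VectorAdd   // BinaryOp applied to the two children
>;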
@ -1,250 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file

  \brief Epilogue visitor op that broadcasts a vector to all columns
*/

#pragma once
#include "cutlass/cutlass.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
///  ElementVector T[i][j] <- device-memory Td[i]
///
/// It can only be a leaf node in the epilogue tree
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename ElementFragment_,     ///< Data type used to cache the vector in registers
  typename InputTileIterator_    ///< Tile iterator type to read the broadcast tensor
>
class VisitorOpColumnBroadcast {
public:
  using InputTileIterator = InputTileIterator_;

  static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
  using ElementAccumulator = ElementAccumulator_;
  using ElementVector = typename InputTileIterator::Element;
  using ElementFragment = ElementFragment_;

  using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;

  /// Thread map used by input tile iterators
  using ThreadMap = typename InputTileIterator::ThreadMap;

  /// Fragment object used to store the broadcast values
  using BroadcastFragment = Array<
    ElementFragment, kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Used for the broadcast
  struct BroadcastDetail {
    /// Number of threads per warp
    static int const kWarpSize = 32;

    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar column indices handled by each thread
    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar row indices handled by each thread
    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;

    /// Number of threads per threadblock
    static int const kThreadCount = ThreadMap::kThreads;

    /// Number of distinct threads per row of output tile
    static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);

    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock.
    static int const kThreadRows = kThreadCount / kThreadsPerRow;

    // /// Number of iterations (accesses) the threadblock takes to reduce a row
    // static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
  };

  // using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;

  struct SharedStorage {
    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementVector *broadcast_ptr;  ///< Pointer to the additional tensor operand
    int64_t batch_stride;

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments():
      broadcast_ptr(nullptr),
      batch_stride(0) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementVector *broadcast_ptr,
      int64_t batch_stride
    ):
      broadcast_ptr(broadcast_ptr),
      batch_stride(batch_stride) { }
  };

  /// Param structure
  struct Params {
    ElementVector *broadcast_ptr;  ///< Pointer to the additional tensor operand
    int64_t batch_stride;

    /// Methods
    CUTLASS_HOST_DEVICE
    Params():
      broadcast_ptr(nullptr),
      batch_stride(0) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      broadcast_ptr(args.broadcast_ptr),
      batch_stride(args.batch_stride) { }
  };

private:
  ElementVector *broadcast_ptr;
  BroadcastFragment broadcast_fragment;  ///< Array holding the loaded broadcast fragment
  MatrixCoord threadblock_offset_;
  int thread_idx_;
  MatrixCoord problem_size;

  int thread_start_row_;
  int state_[3];
  int thread_offset_row_;

  int64_t batch_stride_;

public:
  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpColumnBroadcast(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    broadcast_ptr(params.broadcast_ptr),
    threadblock_offset_(threadblock_offset),
    thread_idx_(thread_idx),
    problem_size(problem_size),
    thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
    batch_stride_(params.batch_stride)
  {
    state_[0] = state_[1] = state_[2] = 0;
  }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    broadcast_ptr += batch_idx * batch_stride_;
  }

  CUTLASS_DEVICE
  void begin_epilogue() { }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {}

  CUTLASS_DEVICE
  void begin_row(int row_idx) {}

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    // Locate the row this fragment maps to and load its broadcast value
    thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();

    ElementFragment broadcast_data = ElementFragment(*(broadcast_ptr + thread_offset_row_));

    broadcast_fragment.fill(broadcast_data);

    return broadcast_fragment;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) { }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    // Advance the row/group/cluster state machine
    ++state_[0];

    thread_start_row_ += ThreadMap::Shape::kRow;
    if (state_[0] == ThreadMap::Count::kRow) {
      state_[0] = 0;
      ++state_[1];
      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;

      if (state_[1] == ThreadMap::Count::kGroup) {
        state_[1] = 0;
        ++state_[2];
        thread_start_row_ += ThreadMap::Count::kGroup *
          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;

        if (state_[2] == ThreadMap::Count::kCluster) {
          state_[2] = 0;
        }
      }
    }
  }

  CUTLASS_DEVICE
  void end_epilogue() { }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
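// Example (editorial sketch): a per-row bias fed into the epilogue tree.
// InputTileIterator is whatever tile iterator the kernel already uses for the
// broadcast operand; the alias only illustrates the parameter order.
template <typename InputTileIterator>
using BiasBroadcastLeaf = cutlass::epilogue::threadblock::VisitorOpColumnBroadcast<
    float,              // ElementAccumulator
    float,              // ElementFragment cached in registers
    InputTileIterator   // iterator describing the broadcast tensor
>;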
@ -1,341 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 **************************************************************************************************/
/*! \file
|
||||
|
||||
\brief A file contains the epilogue visitor Op with reduction over columns in CTA
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "cutlass/cutlass.h"
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace cutlass {
|
||||
namespace epilogue {
|
||||
namespace threadblock {
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/// Epilogue Visitor operator for the following computation:
|
||||
///
|
||||
/// ElementReductionAccumulator R[j] = \sum_i ElementReductionAccumulator(T[i][j])
|
||||
/// device memory <- ElementReduction(R[j])
|
||||
///
|
||||
template <
|
||||
typename ThreadblockShape_, /// Threadblock shape
|
||||
typename ElementAccumulator_, ///< Data type of the Accumulator
|
||||
typename ElementReduction_, ///< Data type of the output reduction in device memory
|
||||
typename ElementReductionAccumulator_ , ///< Data type to accumulate reduction in smem and register
|
||||
typename OutputTileIterator_, ///< Tile Iterator type
|
||||
typename Visitor_ ///< preceding visitor op
|
||||
>
|
||||
class VisitorOpColumnReduction {
|
||||
public:
|
||||
using ElementAccumulator = ElementAccumulator_;
|
||||
using ElementReductionAccumulator = ElementReductionAccumulator_;
|
||||
using ElementReduction = ElementReduction_;
|
||||
using OutputTileIterator = OutputTileIterator_;
|
||||
using ThreadblockShape = ThreadblockShape_;
|
||||
using Visitor = Visitor_;
|
||||
|
||||
static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
|
||||
|
||||
using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
|
||||
using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
|
||||
using ElementOutput = typename OutputTileIterator::Element;
|
||||
|
||||
|
||||
|
||||
/// Fragment type returned from Visitor
|
||||
using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
|
||||
using ElementVisitor = typename VisitAccessTypeVisitor::Element;
|
||||
|
||||
using VisitAccessType = VisitAccessTypeVisitor;
|
||||
|
||||
/// Fragment type of accumulator
|
||||
using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;
|
||||
|
||||
/// Fragment type of reduction
|
||||
using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;
|
||||
|
||||
/// Thread map used by output tile iterators
|
||||
using ThreadMap = typename OutputTileIterator::ThreadMap;
|
||||
/// Used for the reduction
|
||||
struct ReductionDetail {
|
||||
|
||||
/// Number of threads per warp
|
||||
static int const kWarpSize = 32;
|
||||
|
||||
/// Number of distinct scalar column indices handled by each thread
|
||||
static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;
|
||||
|
||||
/// Number of distinct scalar row indices handled by each thread
|
||||
static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;
|
||||
|
||||
/// Number of threads per threadblock
|
||||
static int const kThreadCount = ThreadMap::kThreads;
|
||||
|
||||
/// Number of distinct threads per row of output tile
|
||||
static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;
|
||||
|
||||
/// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
|
||||
static int const kThreadRows = kThreadCount / kThreadsPerRow;
|
||||
|
||||
/// Number of iterations (accesses) the threadblock takes to reduce a row
|
||||
static int const kThreadAccessesPerRow = const_max(1, (ThreadblockShape::kN + kThreadCount - 1) / kThreadCount);
|
||||
|
||||
using StorageShape = MatrixShape<
|
||||
kThreadRows,
|
||||
ThreadblockShape::kN
|
||||
>;
|
||||
};
|
||||
|
||||
  using ReductionFragment = Array<ElementReductionAccumulator, ReductionDetail::kColumnsPerThread>;

  /// Shared storage
  struct SharedStorage {
    typename Visitor::SharedStorage storage_visitor;
    AlignedArray<ElementReductionAccumulator, ReductionDetail::StorageShape::kCount, 16> reduction;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementReduction *reduction_ptr;          ///< Pointer to the reduction tensor in device memory
    int64_t batch_stride;
    typename Visitor::Arguments visitor_arg;  ///< Argument type of visitor

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments(): reduction_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementReduction *reduction_ptr,
      int64_t batch_stride,
      typename Visitor::Arguments visitor_arg
    ):
      reduction_ptr(reduction_ptr),
      batch_stride(batch_stride),
      visitor_arg(visitor_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    ElementReduction *reduction_ptr;         ///< Pointer to the reduction tensor in device memory
    int64_t batch_stride;
    typename Visitor::Params visitor_param;  ///< Parameter type of visitor

    /// Methods
    CUTLASS_HOST_DEVICE
    Params(): reduction_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      reduction_ptr(args.reduction_ptr),
      batch_stride(args.batch_stride),
      visitor_param(args.visitor_arg)
    { }
  };

private:
  ElementReduction *reduction_output_ptr_;           ///< Pointer to the reduction tensor in device memory
  ElementReductionAccumulator *reduction_smem_ptr_;  ///< Pointer to the partial reductions in shared memory
  ReductionFragment reduction_fragment;              ///< Register fragment that holds the partial reduction
  Visitor visitor_;                                  ///< Child visitor
  int thread_idx_;
  MatrixCoord threadblock_offset;
  MatrixCoord problem_size_;
  int64_t batch_stride_;

public:

  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpColumnReduction(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    visitor_(params.visitor_param, shared_storage.storage_visitor,
             thread_idx, threadblock_offset, problem_size),
    reduction_smem_ptr_(shared_storage.reduction.data()),
    reduction_output_ptr_(params.reduction_ptr),
    thread_idx_(thread_idx),
    threadblock_offset(threadblock_offset),
    problem_size_(problem_size),
    batch_stride_(params.batch_stride)
  { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    reduction_output_ptr_ += batch_idx * batch_stride_;
    visitor_.set_batch_index(batch_idx);
  }

  CUTLASS_DEVICE
  void begin_epilogue() {
    visitor_.begin_epilogue();

    // Clear the reduction fragment
    reduction_fragment.clear();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    visitor_.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    visitor_.begin_row(row_idx);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get result from visitor
    VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);

    // Accumulate the visited fragment into this thread's partial column reduction
    NumericArrayConverter<ElementReductionAccumulator, ElementVisitor, kElementsPerAccess> reduction_converter;
    ReductionOp reduction_op;
    ReductionAccumulatorAccessType *reduction_fragment_ = reinterpret_cast<ReductionAccumulatorAccessType *>(&reduction_fragment);
    reduction_fragment_[column_idx] = reduction_op(reduction_fragment_[column_idx], reduction_converter(result));

    return result;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) {
    visitor_.end_row(row_idx);
  }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    visitor_.end_step(step_idx);
  }

  CUTLASS_DEVICE
  void end_epilogue() {
    visitor_.end_epilogue();

    //
    // Store the partially reduced value to SMEM
    //

    // Guard against uses of the existing SMEM tile
    __syncthreads();

    using AccessType = AlignedArray<ElementReductionAccumulator, ThreadMap::kElementsPerAccess>;

    //
    // Determine a compact thread arrangement to store to SMEM
    //

    MatrixCoord thread_offset(
      thread_idx_ / ReductionDetail::kThreadsPerRow,
      (thread_idx_ % ReductionDetail::kThreadsPerRow) * ThreadMap::kElementsPerAccess
    );

    //
    // Each thread stores its fragment to SMEM
    //
    AccessType *aligned_reduction_ptr = reinterpret_cast<AccessType *>(
      &reduction_smem_ptr_[thread_offset.row() * ThreadblockShape::kN + thread_offset.column()]
    );

    AccessType const *frag_ptr = reinterpret_cast<AccessType const *>(
      &reduction_fragment
    );

    CUTLASS_PRAGMA_UNROLL
    for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) {
      int col_idx = column * ThreadMap::Delta::kColumn / ThreadMap::kElementsPerAccess;

      aligned_reduction_ptr[col_idx] = frag_ptr[column];
    }

    __syncthreads();

    //
    // Now threads are assigned several columns of the output. They fetch all rows from
    // the compacted SMEM tile and perform a reduction.
    //

    NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;

    CUTLASS_PRAGMA_UNROLL
    for (int j = 0; j < ReductionDetail::kThreadAccessesPerRow; ++j) {
      int column_idx = thread_idx_ + j * ReductionDetail::kThreadCount;

      ReductionOpScalar reduction_op;
      ElementReductionAccumulator reduction_element = ElementReductionAccumulator();

      int output_column_idx = threadblock_offset.column() + column_idx;

      if (column_idx < ThreadblockShape::kN && output_column_idx < problem_size_.column()) {

        CUTLASS_PRAGMA_UNROLL
        for (int row = 0; row < ReductionDetail::kThreadRows; ++row) {
          if (row) {
            auto frag = reduction_smem_ptr_[row * ThreadblockShape::kN + column_idx];
            reduction_element = reduction_op(reduction_element, frag);
          }
          else {
            reduction_element = reduction_smem_ptr_[column_idx];
          }
        }

        // Store the reduced column to global memory; each threadblock row writes its own partial row
        reduction_output_ptr_[column_idx + threadblock_offset.column() + threadblock_offset.row() / ThreadblockShape::kM * problem_size_.column()] = output_converter(reduction_element);
      }
    }
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,266 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op with Linear Combination
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/functional.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/numeric_conversion.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///   ElementCompute alpha;
///   ElementCompute beta;
///   ElementCompute C = BinaryOp(alpha * ElementCompute(Visitor_A), beta * ElementCompute(Visitor_B))
///   Return C;
///
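/// Example (a sketch, not taken from the library): with the plus combination op
/// used below, one fragment of width kElementsPerAccess is computed elementwise as
///
///   c[i] = alpha * ElementCompute(a[i]) + beta * ElementCompute(b[i])
///
/// and, as the method bodies show, a child visitor is skipped entirely (its hooks
/// are never invoked and its result is filled with zeros) whenever its scale
/// factor compares equal to ElementCompute(0).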
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename ElementCompute_,      ///< Data type used to compute linear combination
  int      kElementsPerAccess_,  ///< Number of elements computed per operation
  typename VisitorA_,            ///< Child node A
  typename VisitorB_             ///< Child node B
>
class VisitorOpLinearCombination {
public:
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  static int const kElementsPerAccess = kElementsPerAccess_;

  using VisitorA = VisitorA_;
  using VisitorB = VisitorB_;

  /// Fragment type returned from VisitorA.visit
  using VisitAccessTypeA = typename VisitorA::VisitAccessType;
  using ElementA = typename VisitAccessTypeA::Element;

  /// Fragment type returned from VisitorB.visit
  using VisitAccessTypeB = typename VisitorB::VisitAccessType;
  using ElementB = typename VisitAccessTypeB::Element;

  /// Fragment type returned by this visitor
  using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Combination Op
  using CombinationOp = cutlass::plus<VisitAccessType>;

  static_assert(kElementsPerAccess == VisitAccessTypeA::kElements, "kElementsPerAccess mismatches with Visitor A");
  static_assert(kElementsPerAccess == VisitAccessTypeB::kElements, "kElementsPerAccess mismatches with Visitor B");

  /// SMEM buffer class required in the epilogue visitor
  struct SharedStorage {
    typename VisitorA::SharedStorage storage_a;
    typename VisitorB::SharedStorage storage_b;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Arguments structure
  struct Arguments {
    ElementCompute alpha;                        ///< scales accumulators
    ElementCompute beta;                         ///< scales source tensor
    typename VisitorA::Arguments visitor_a_arg;  ///< Argument type for visitor_a
    typename VisitorB::Arguments visitor_b_arg;  ///< Argument type for visitor_b

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Arguments():
      alpha(ElementCompute(1)),
      beta(ElementCompute(0))
    { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementCompute alpha,
      ElementCompute beta,
      typename VisitorA::Arguments visitor_a_arg,
      typename VisitorB::Arguments visitor_b_arg
    ):
      alpha(alpha),
      beta(beta),
      visitor_a_arg(visitor_a_arg),
      visitor_b_arg(visitor_b_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    ElementCompute alpha;                       ///< scales accumulators
    ElementCompute beta;                        ///< scales source tensor
    typename VisitorA::Params visitor_a_param;  ///< Parameter type for visitor_a
    typename VisitorB::Params visitor_b_param;  ///< Parameter type for visitor_b

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      alpha(args.alpha),
      beta(args.beta),
      visitor_a_param(args.visitor_a_arg),
      visitor_b_param(args.visitor_b_arg)
    { }
  };

private:
  //
  // Data members
  //

  ElementCompute alpha_;
  ElementCompute beta_;

  VisitorA visitor_a_op;
  VisitorB visitor_b_op;

public:

  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpLinearCombination(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    alpha_(params.alpha),
    beta_(params.beta),
    visitor_a_op(params.visitor_a_param, shared_storage.storage_a, thread_idx, threadblock_offset, problem_size),
    visitor_b_op(params.visitor_b_param, shared_storage.storage_b, thread_idx, threadblock_offset, problem_size)
  { }

  CUTLASS_DEVICE
  void begin_epilogue() {
    if (alpha_ != ElementCompute(0)) visitor_a_op.begin_epilogue();
    if (beta_ != ElementCompute(0)) visitor_b_op.begin_epilogue();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    if (alpha_ != ElementCompute(0)) visitor_a_op.begin_step(step_idx);
    if (beta_ != ElementCompute(0)) visitor_b_op.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    if (alpha_ != ElementCompute(0)) visitor_a_op.begin_row(row_idx);
    if (beta_ != ElementCompute(0)) visitor_b_op.begin_row(row_idx);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get results from visitor A and visitor B
    VisitAccessTypeA result_A;
    VisitAccessTypeB result_B;

    if (alpha_ != ElementCompute(0)) {
      result_A = visitor_a_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
    } else {
      // Fill result A with zeros
      result_A.clear();
    }

    if (beta_ != ElementCompute(0)) {
      result_B = visitor_b_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
    } else {
      // Fill result B with zeros
      result_B.clear();
    }

    /// Type conversion
    NumericArrayConverter<ElementCompute, ElementA, kElementsPerAccess> source_converter_A;
    NumericArrayConverter<ElementCompute, ElementB, kElementsPerAccess> source_converter_B;

    CombinationOp combination_op;

    cutlass::multiplies<VisitAccessType> multiply_op;

    return combination_op(
      multiply_op(alpha_, source_converter_A(result_A)),
      multiply_op(beta_, source_converter_B(result_B))
    );
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) {
    if (alpha_ != ElementCompute(0)) visitor_a_op.end_row(row_idx);
    if (beta_ != ElementCompute(0)) visitor_b_op.end_row(row_idx);
  }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    if (alpha_ != ElementCompute(0)) visitor_a_op.end_step(step_idx);
    if (beta_ != ElementCompute(0)) visitor_b_op.end_step(step_idx);
  }

  CUTLASS_DEVICE
  void end_epilogue() {
    if (alpha_ != ElementCompute(0)) visitor_a_op.end_epilogue();
    if (beta_ != ElementCompute(0)) visitor_b_op.end_epilogue();
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
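// Usage sketch (hypothetical; the child visitor types and template arguments here
// are illustrative, not a configuration shipped with the library). A visitor tree
// computing alpha * acc + beta * bias_row could be spelled roughly as:
//
//   using Accum = VisitorOpAccumulator<float, 8>;                    // assumed leaf that surfaces the accumulator
//   using Bias  = VisitorOpRowBroadcast<float, float, BiasIterator>; // defined later in this diff
//   using Combination = VisitorOpLinearCombination<float, float, 8, Accum, Bias>;
//
// The root's Arguments nest the children's Arguments, e.g.
// Combination::Arguments{alpha, beta, {}, {bias_ptr, batch_stride}}.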
@ -1,258 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op that broadcasts a vector to all rows
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/numeric_conversion.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
///   ElementVector T[i][j] <- device-memory Td[j]
///
/// It can only be a leaf node in the epilogue tree
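/// Example (sketch): with a bias vector Td = {b0, b1, b2, ...} in device memory,
/// every row of the visited tile observes the same values, i.e.
/// T[0][j] == T[1][j] == ... == Td[j]. Composed under VisitorOpLinearCombination,
/// this yields a classic per-column bias add.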
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename ElementFragment_,     ///< Data type used to cache the vector in registers
  typename InputTileIterator_    ///< Tile iterator type to read the broadcast tensor
>
class VisitorOpRowBroadcast {
public:
  using InputTileIterator = InputTileIterator_;

  static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
  using ElementAccumulator = ElementAccumulator_;
  using ElementVector = typename InputTileIterator::Element;
  using ElementFragment = ElementFragment_;

  using VisitAccessType = Array<ElementFragment, kElementsPerAccess>;

  /// Thread map used by input tile iterators
  using ThreadMap = typename InputTileIterator::ThreadMap;

  /// Fragment object used to store the broadcast values
  using BroadcastFragment = Array<
    ElementFragment,
    ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Used for the broadcast
  struct BroadcastDetail {
    /// Number of threads per warp
    static int const kWarpSize = 32;

    static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar column indices handled by each thread
    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar row indices handled by each thread
    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;

    /// Number of threads per threadblock
    static int const kThreadCount = ThreadMap::kThreads;

    /// Number of distinct threads per row of the output tile
    static int const kThreadsPerRow = (InputTileIterator::Shape::kN / kColumnsPerThread);

    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
    static int const kThreadRows = kThreadCount / kThreadsPerRow;

    // /// Number of iterations (accesses) the threadblock takes to reduce a row
    // static int const kThreadAccessesPerRow = const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount);
  };

  // using ComputeFragmentType = Array<ElementVector, BroadcastDetail::kElementsPerAccess>;

  struct SharedStorage {
    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementVector *broadcast_ptr;  ///< Pointer to the additional tensor operand
    int64_t batch_stride;

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments():
      broadcast_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementVector *broadcast_ptr,
      int64_t batch_stride
    ):
      broadcast_ptr(broadcast_ptr),
      batch_stride(batch_stride) { }
  };

  /// Parameter structure
  struct Params {
    ElementVector *broadcast_ptr;  ///< Pointer to the additional tensor operand
    int64_t batch_stride;

    /// Methods
    CUTLASS_HOST_DEVICE
    Params():
      broadcast_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      broadcast_ptr(args.broadcast_ptr),
      batch_stride(args.batch_stride) { }
  };

private:
  ElementVector *broadcast_ptr;
  BroadcastFragment broadcast_fragment;  ///< Array that holds the loaded broadcast fragment
  MatrixCoord threadblock_offset_;
  int thread_idx_;
  MatrixCoord problem_size;
  int64_t batch_stride_;

public:
  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpRowBroadcast(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    broadcast_ptr(params.broadcast_ptr + threadblock_offset.column()),
    threadblock_offset_(threadblock_offset),
    thread_idx_(thread_idx),
    problem_size(problem_size),
    batch_stride_(params.batch_stride) { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    broadcast_ptr += batch_idx * batch_stride_;
  }

  CUTLASS_DEVICE
  void begin_epilogue() {
    // Load the broadcast fragment once per epilogue
    load_broadcast_fragment_();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) { }

  CUTLASS_DEVICE
  void begin_row(int row_idx) { }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    // Every row returns the same cached fragment for its column
    VisitAccessType *broadcast_fragment_ = reinterpret_cast<VisitAccessType *>(&broadcast_fragment);
    return broadcast_fragment_[column_idx];
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) { }

  CUTLASS_DEVICE
  void end_step(int step_idx) { }

  CUTLASS_DEVICE
  void end_epilogue() { }

private:

  CUTLASS_DEVICE
  void load_broadcast_fragment_() {

    broadcast_fragment.clear();

    // If no pointer is supplied, fill with all zeros and avoid memory accesses
    if (!broadcast_ptr) {
      return;
    }

    int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column();

    int thread_column_idx = threadblock_offset_.column() + thread_initial_column;
    broadcast_ptr += thread_initial_column;

    NumericArrayConverter<ElementFragment, ElementVector, BroadcastDetail::kElementsPerAccess> converter;
    using AccessType = AlignedArray<ElementVector, BroadcastDetail::kElementsPerAccess>;
    using AccessFragmentType = Array<ElementFragment, BroadcastDetail::kElementsPerAccess>;

    AccessFragmentType *frag_ptr = reinterpret_cast<AccessFragmentType *>(&broadcast_fragment);

    CUTLASS_PRAGMA_UNROLL
    for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) {

      AccessType loaded;

      loaded.clear();

      if (thread_column_idx < problem_size.column()) {
        loaded = *reinterpret_cast<AccessType const *>(broadcast_ptr);
      }

      AccessFragmentType cvt = converter(loaded);
      frag_ptr[j] = cvt;

      thread_column_idx += ThreadMap::Delta::kColumn;
      broadcast_ptr += ThreadMap::Delta::kColumn;
    }
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,319 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op with reduction over rows in the CTA
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/functional.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/numeric_conversion.h"
#include "cutlass/arch/memory.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////
/// Epilogue Visitor operator for the following computation:
///
///   ElementReductionAccumulator R[i] = \sum_j ElementReductionAccumulator(T[i][j])
///   device memory <- ElementReduction(R[i])
///
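/// Example (sketch): for a 2 x 3 tile T = {{1, 2, 3}, {4, 5, 6}}, the visitor
/// produces R = {6, 15} and stores ElementReduction(R[i]) to device memory,
/// one value per row.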
template <
  typename ThreadblockShape_,             ///< Threadblock shape
  typename ElementAccumulator_,           ///< Data type of the Accumulator
  typename ElementReduction_,             ///< Data type of the output reduction in device memory
  typename ElementReductionAccumulator_,  ///< Data type used to accumulate the reduction in SMEM and registers
  typename OutputTileIterator_,           ///< Tile Iterator type
  typename Visitor_                       ///< Preceding visitor op
>
class VisitorOpRowReduction {
public:
  using ElementAccumulator = ElementAccumulator_;
  using ElementReductionAccumulator = ElementReductionAccumulator_;
  using ElementReduction = ElementReduction_;
  using OutputTileIterator = OutputTileIterator_;
  using ThreadblockShape = ThreadblockShape_;
  using Visitor = Visitor_;

  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;

  using ReductionOp = cutlass::plus<Array<ElementReductionAccumulator, kElementsPerAccess>>;
  using ReductionOpScalar = cutlass::plus<ElementReductionAccumulator>;
  using ElementOutput = typename OutputTileIterator::Element;

  /// Fragment type returned from Visitor
  using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
  using ElementVisitor = typename VisitAccessTypeVisitor::Element;

  using VisitAccessType = VisitAccessTypeVisitor;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Fragment type of reduction
  using ReductionAccumulatorAccessType = Array<ElementReductionAccumulator, kElementsPerAccess>;

  /// Thread map used by output tile iterators
  using ThreadMap = typename OutputTileIterator::ThreadMap;

  /// Used for the reduction
  struct ReductionDetail {

    /// Number of threads per warp
    static int const kWarpSize = 32;

    /// Number of distinct scalar column indices handled by each thread
    static int const kColumnsPerThread = ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess;

    /// Number of distinct scalar row indices handled by each thread
    static int const kRowsPerThread = ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn;

    /// Number of threads per threadblock
    static int const kThreadCount = ThreadMap::kThreads;

    /// Number of distinct threads per row of the output tile
    static int const kThreadsPerRow = ThreadblockShape::kN / kColumnsPerThread;

    /// Half the number of threads per row, used for the cross-thread butterfly reduction
    static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);

    /// Number of distinct threads which must be reduced during the final reduction phase within the threadblock
    static int const kThreadRows = kThreadCount / kThreadsPerRow;
  };

  /// Shared storage
  struct SharedStorage {
    typename Visitor::SharedStorage storage_visitor;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementReduction *reduction_ptr;          ///< Pointer to the reduction tensor in device memory
    int64_t batch_stride;
    typename Visitor::Arguments visitor_arg;  ///< Argument type of visitor

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments(): reduction_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementReduction *reduction_ptr,
      int64_t batch_stride,
      typename Visitor::Arguments visitor_arg
    ):
      reduction_ptr(reduction_ptr),
      batch_stride(batch_stride),
      visitor_arg(visitor_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    ElementReduction *reduction_ptr;         ///< Pointer to the reduction tensor in device memory
    int64_t batch_stride;
    typename Visitor::Params visitor_param;  ///< Parameter type of visitor

    /// Methods
    CUTLASS_HOST_DEVICE
    Params(): reduction_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      reduction_ptr(args.reduction_ptr),
      batch_stride(args.batch_stride),
      visitor_param(args.visitor_arg)
    { }
  };

private:
  ElementReduction *reduction_output_ptr_;  ///< Pointer to the reduction tensor in device memory
  ElementReductionAccumulator reduction_accum;
  Visitor visitor_;                         ///< Child visitor
  int thread_idx_;
  MatrixCoord threadblock_offset;
  MatrixCoord problem_size_;

  int thread_start_row_;  ///< Starting row index of the current thread
  int state_[3];          ///< Tracks the row position across steps, groups, and clusters
  int thread_offset_row_;
  int64_t batch_stride_;

public:
  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpRowReduction(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    visitor_(params.visitor_param, shared_storage.storage_visitor,
             thread_idx, threadblock_offset, problem_size),
    reduction_output_ptr_(params.reduction_ptr),
    thread_idx_(thread_idx),
    threadblock_offset(threadblock_offset),
    problem_size_(problem_size),
    thread_start_row_(ThreadMap::initial_offset(thread_idx).row() + threadblock_offset.row()),
    batch_stride_(params.batch_stride)
  {
    state_[0] = state_[1] = state_[2] = 0;
  }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    reduction_output_ptr_ += batch_idx * batch_stride_;
    visitor_.set_batch_index(batch_idx);
  }

  CUTLASS_DEVICE
  void begin_epilogue() {
    visitor_.begin_epilogue();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    visitor_.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    visitor_.begin_row(row_idx);

    reduction_accum = ElementReductionAccumulator(0);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get result from visitor
    VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);

    thread_offset_row_ = thread_start_row_ + ThreadMap::iteration_offset(frag_idx).row();

    ReductionOpScalar reduction_op;

    // In-thread reduction of the visited fragment
    ElementReductionAccumulator reduction_accum_ = reduction(result);

    // After performing the in-thread reduction, perform the cross-thread / in-warp
    // butterfly reduction so every thread in the row group holds the full partial sum
    CUTLASS_PRAGMA_UNROLL
    for (int i = ReductionDetail::kHalfThreadsPerRow; i > 0; i >>= 1) {
      reduction_accum_ = reduction_op(reduction_accum_, __shfl_xor_sync(0xFFFFFFFF, reduction_accum_, i));
    }
    reduction_accum = reduction_op(reduction_accum, reduction_accum_);

    return result;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) {
    visitor_.end_row(row_idx);
    NumericConverter<ElementReduction, ElementReductionAccumulator> output_converter;

    // Only the first thread of each row group writes, and only for in-bounds rows
    bool is_write_thread = (thread_offset_row_ < problem_size_.row() && (thread_idx_ % ReductionDetail::kThreadsPerRow) == 0);
    // Each column of threadblocks writes its partial sums to its own stripe of length problem_size.row()
    int row_offset = thread_offset_row_ + threadblock_offset.column() / ThreadblockShape::kN * problem_size_.row();

    ElementReduction *curr_ptr_reduction = reduction_output_ptr_ + row_offset;

    arch::global_store<ElementReduction, sizeof(ElementReduction)>(
      output_converter(reduction_accum),
      (void *)curr_ptr_reduction,
      is_write_thread);
  }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    visitor_.end_step(step_idx);

    // Advance the row position (mirrors the output tile iterator's operator++)
    ++state_[0];

    thread_start_row_ += ThreadMap::Shape::kRow;
    if (state_[0] == ThreadMap::Count::kRow) {
      state_[0] = 0;
      ++state_[1];
      thread_start_row_ += (ThreadMap::Shape::kGroup - 1) *
        ThreadMap::Shape::kRow * ThreadMap::Count::kRow;

      if (state_[1] == ThreadMap::Count::kGroup) {
        state_[1] = 0;
        ++state_[2];
        thread_start_row_ += ThreadMap::Count::kGroup *
          ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * ThreadMap::Shape::kRow;

        if (state_[2] == ThreadMap::Count::kCluster) {
          state_[2] = 0;
        }
      }
    }
  }

  CUTLASS_DEVICE
  void end_epilogue() {
    visitor_.end_epilogue();
  }

private:

  /// Sums the elements of a visited fragment within the thread
  CUTLASS_DEVICE
  ElementReductionAccumulator reduction(VisitAccessTypeVisitor const &result) {
    ElementReductionAccumulator sum_ = ElementReductionAccumulator(0);

    ReductionOpScalar reduction_op;

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < VisitAccessTypeVisitor::kElements; ++i) {
      sum_ = reduction_op(sum_, result[i]);
    }

    return sum_;
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
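// Sketch of the warp-level butterfly used in visit() above (assuming a hypothetical
// kThreadsPerRow = 4, so kHalfThreadsPerRow = 2): lanes {0,1,2,3} holding partial
// sums {a,b,c,d} exchange with XOR offsets 2 and then 1:
//
//   step i = 2:  {a+c, b+d, c+a, d+b}
//   step i = 1:  {a+c+b+d, b+d+a+c, ...}  -- every lane now holds the full sum
//
// Only the lane with (thread_idx % kThreadsPerRow) == 0 then writes the result,
// as end_row() shows.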
@ -1,188 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op with Tensor Input
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/matrix_coord.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///   ElementInput C <- device memory
///
/// It can only be a leaf node in the epilogue tree
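/// Example (sketch): composed as the Visitor_B child of VisitorOpLinearCombination,
/// this leaf supplies the source tensor C (read tile-by-tile in begin_step() with
/// leading dimension ldt) so the tree evaluates the classic GEMM epilogue
/// alpha * acc + beta * C.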
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename InputTileIterator_    ///< Tile iterator type to read the tensor
>
class VisitorOpTensorInput {
public:
  using ElementAccumulator = ElementAccumulator_;
  using InputTileIterator = InputTileIterator_;

  static int const kElementsPerAccess = InputTileIterator::kElementsPerAccess;
  using ElementInput = typename InputTileIterator::Element;

  using VisitAccessType = Array<ElementInput, kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  struct SharedStorage {
    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementInput *input_ptr;  ///< Pointer to the input tensor in device memory
    int ldt;                  ///< Leading dimension of the input tensor operand
    int64_t batch_stride;     ///< Batch stride for batched GEMM

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments(): input_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementInput *input_ptr,
      int ldt, int64_t batch_stride
    ):
      input_ptr(input_ptr),
      ldt(ldt),
      batch_stride(batch_stride)
    { }
  };

  /// Parameter structure
  struct Params {
    typename InputTileIterator::Params params_input;
    ElementInput *input_ptr;
    int64_t batch_stride;

    /// Methods
    CUTLASS_HOST_DEVICE
    Params():
      input_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      params_input(args.ldt),
      input_ptr(args.input_ptr),
      batch_stride(args.batch_stride)
    { }
  };

private:
  InputTileIterator iterator_T_;
  typename InputTileIterator::Fragment fragment_T_;
  MatrixCoord problem_size;
  int64_t batch_stride_;

public:
  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpTensorInput(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    iterator_T_(
      InputTileIterator(
        params.params_input,
        params.input_ptr,
        problem_size,
        thread_idx,
        threadblock_offset
      )
    ),
    problem_size(problem_size),
    batch_stride_(params.batch_stride) { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
  }

  CUTLASS_DEVICE
  void begin_epilogue() { }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    // Load the next tile of the input tensor
    fragment_T_.clear();
    iterator_T_.load(fragment_T_);
    ++iterator_T_;
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) { }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    VisitAccessType source = reinterpret_cast<VisitAccessType *>(&fragment_T_)[frag_idx];
    return source;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) { }

  CUTLASS_DEVICE
  void end_step(int step_idx) { }

  CUTLASS_DEVICE
  void end_epilogue() { }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,240 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op with Tensor Output
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/numeric_conversion.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///   ElementOutput T = ElementOutput(Visitor)
///   T -> device memory
///
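/// Example (sketch): placed anywhere in the tree, this node converts its child's
/// fragment to ElementOutput and stores it through OutputTileIterator while
/// returning the unconverted result upward -- which is how an intermediate value
/// (for instance a pre-activation result) can be materialized to device memory
/// without terminating the tree.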
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename OutputTileIterator_,  ///< Tile iterator type to write the tensor
  typename Visitor_              ///< Child visitor that produces the output tensor
>
class VisitorOpTensorOutput {
public:
  using ElementAccumulator = ElementAccumulator_;
  using OutputTileIterator = OutputTileIterator_;

  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
  using ElementOutput = typename OutputTileIterator::Element;

  using Visitor = Visitor_;

  /// Fragment type returned from Visitor
  using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
  using ElementVisitor = typename VisitAccessTypeVisitor::Element;

  using VisitAccessType = VisitAccessTypeVisitor;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Fragment type of output
  using OutputAccessType = Array<ElementOutput, kElementsPerAccess>;

  static_assert(kElementsPerAccess == VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");

  struct SharedStorage {
    typename Visitor::SharedStorage storage_visitor;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Argument structure
  struct Arguments {
    ElementOutput *output_ptr;                ///< Pointer to the output tensor in device memory
    int ldt;                                  ///< Leading dimension of the output tensor operand
    int64_t batch_stride;                     ///< Batch stride
    typename Visitor::Arguments visitor_arg;  ///< Argument type of visitor

    /// Methods
    CUTLASS_HOST_DEVICE
    Arguments(): output_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Arguments(
      ElementOutput *output_ptr,
      int ldt,
      int64_t batch_stride,
      typename Visitor::Arguments visitor_arg
    ):
      output_ptr(output_ptr),
      ldt(ldt),
      batch_stride(batch_stride),
      visitor_arg(visitor_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    typename OutputTileIterator::Params params_output;
    ElementOutput *output_ptr;
    int64_t batch_stride;
    typename Visitor::Params visitor_param;

    /// Methods
    CUTLASS_HOST_DEVICE
    Params():
      output_ptr(nullptr) { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      params_output(args.ldt),
      output_ptr(args.output_ptr),
      batch_stride(args.batch_stride),
      visitor_param(args.visitor_arg)
    { }
  };

private:
  OutputTileIterator iterator_T_;
  typename OutputTileIterator::Fragment fragment_T_;
  MatrixCoord problem_size;
  Visitor visitor_;
  int64_t batch_stride_;

public:

  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpTensorOutput(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    visitor_(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size),
    iterator_T_(
      OutputTileIterator(
        params.params_output,
        params.output_ptr,
        problem_size,
        thread_idx,
        threadblock_offset
      )
    ),
    problem_size(problem_size),
    batch_stride_(params.batch_stride) { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    iterator_T_.add_pointer_offset(batch_idx * batch_stride_);
    visitor_.set_batch_index(batch_idx);
  }

  CUTLASS_DEVICE
  void begin_epilogue() {
    visitor_.begin_epilogue();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    fragment_T_.clear();
    visitor_.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    visitor_.begin_row(row_idx);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get result from visitor
    VisitAccessTypeVisitor result = visitor_.visit(iter_idx, row_idx, column_idx, frag_idx, accum);

    // Column guard
    MatrixCoord thread_offset_ = iterator_T_.thread_start() + OutputTileIterator::ThreadMap::iteration_offset(frag_idx);
    bool column_guard = (thread_offset_.column() < problem_size.column());

    if (column_guard) {
      NumericArrayConverter<ElementOutput, ElementVisitor, kElementsPerAccess> output_converter;
      OutputAccessType &output = reinterpret_cast<OutputAccessType *>(&fragment_T_)[frag_idx];
      output = output_converter(result);
    }

    // Pass the unconverted result to the parent
    return result;
  }

  CUTLASS_DEVICE
  void end_row(int row_idx) {
    visitor_.end_row(row_idx);
  }

  CUTLASS_DEVICE
  void end_step(int step_idx) {
    visitor_.end_step(step_idx);
    iterator_T_.store(fragment_T_);
    ++iterator_T_;
  }

  CUTLASS_DEVICE
  void end_epilogue() {
    visitor_.end_epilogue();
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,226 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
    \brief This file contains the epilogue visitor Op with Unary operation
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/array.h"
#include "cutlass/matrix_coord.h"
#include "unary_ops.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Epilogue Visitor operator for the following computation:
///
///   ElementCompute C = UnaryOp(ElementCompute(Visitor))
///   Return C;
///
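/// Example (sketch; `MyScale` is a hypothetical functor, not one defined in
/// unary_ops.h): a unary op template of the form
///
///   template <typename T, int N>
///   struct MyScale {
///     struct Arguments { T scale; };
///     struct Params {
///       T scale;
///       Params() { }
///       Params(Arguments const &args): scale(args.scale) { }
///     };
///     // ... constructor from Params, guard(), operator() on Array<T, N> ...
///   };
///
/// can be plugged in as VisitorOpUnary<float, float, 8, Child, MyScale>, provided
/// it exposes the Arguments/Params pair and the guard() predicate that the class
/// below relies on; guard() lets a no-op instance skip the child subtree entirely.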
template <
  typename ElementAccumulator_,  ///< Data type of the Accumulator
  typename ElementCompute_,      ///< Data type used to compute the unary operation
  int      kElementsPerAccess_,  ///< Number of elements computed per operation
  typename Visitor_,             ///< Child node
  template<typename T, int N> typename UnaryOp_
>
class VisitorOpUnary {
public:
  using ElementAccumulator = ElementAccumulator_;
  using ElementCompute = ElementCompute_;
  static int const kElementsPerAccess = kElementsPerAccess_;

  using Visitor = Visitor_;

  /// Fragment type returned from Visitor.visit
  using VisitAccessTypeVisitor = typename Visitor::VisitAccessType;
  using ElementVisit = typename VisitAccessTypeVisitor::Element;

  /// Fragment type returned by this visitor
  using VisitAccessType = Array<ElementCompute, kElementsPerAccess>;

  /// Fragment type of accumulator
  using AccumulatorAccessType = Array<ElementAccumulator, kElementsPerAccess>;

  /// Unary Op
  using UnaryOp = UnaryOp_<ElementCompute, kElementsPerAccess>;

  static_assert(kElementsPerAccess == VisitAccessTypeVisitor::kElements, "kElementsPerAccess mismatches with Visitor");

  /// SMEM buffer class required in the epilogue visitor
  struct SharedStorage {
    typename Visitor::SharedStorage storage_visitor;

    CUTLASS_HOST_DEVICE
    SharedStorage() { }
  };

  /// Host-constructable Arguments structure
  struct Arguments {
    typename UnaryOp::Arguments unary_arg;
    typename Visitor::Arguments visitor_arg;  ///< Argument type for visitor

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Arguments(): unary_arg() { }

    CUTLASS_HOST_DEVICE
    Arguments(
      typename UnaryOp::Arguments unary_arg,
      typename Visitor::Arguments visitor_arg
    ):
      unary_arg(unary_arg),
      visitor_arg(visitor_arg)
    { }
  };

  /// Parameter structure
  struct Params {
    typename UnaryOp::Params unary_param;
    typename Visitor::Params visitor_param;  ///< Parameter type for visitor

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params(): unary_param() { }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      unary_param(args.unary_arg),
      visitor_param(args.visitor_arg)
    { }
  };

private:
  //
  // Data members
  //
  UnaryOp unary_op;

  Visitor visitor_op;

public:

  /// Constructs the function object
  CUTLASS_HOST_DEVICE
  VisitorOpUnary(
    Params const &params,
    SharedStorage &shared_storage,
    int thread_idx,
    MatrixCoord threadblock_offset,
    MatrixCoord problem_size
  ):
    unary_op(params.unary_param),
    visitor_op(params.visitor_param, shared_storage.storage_visitor, thread_idx, threadblock_offset, problem_size)
  { }

  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {
    visitor_op.set_batch_index(batch_idx);
  }

  CUTLASS_DEVICE
  void begin_epilogue() {
    if (unary_op.guard()) visitor_op.begin_epilogue();
  }

  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    if (unary_op.guard()) visitor_op.begin_step(step_idx);
  }

  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    if (unary_op.guard()) visitor_op.begin_row(row_idx);
  }

  CUTLASS_DEVICE
  VisitAccessType visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorAccessType const &accum
  ) {
    /// Get result from the child visitor
    VisitAccessTypeVisitor result;

    if (unary_op.guard()) {
      result = visitor_op.visit(iter_idx, row_idx, column_idx, frag_idx, accum);
|
||||
} else {
|
||||
result.clear();
|
||||
}
|
||||
|
||||
/// Type conversion
|
||||
NumericArrayConverter<ElementCompute, ElementVisit, kElementsPerAccess> source_converter;
|
||||
|
||||
cutlass::multiplies<VisitAccessType> multiply_op;
|
||||
|
||||
return unary_op(source_converter(result));
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
void end_row(int row_idx) {
|
||||
if (unary_op.guard()) visitor_op.end_row(row_idx);
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
void end_step(int step_idx) {
|
||||
if (unary_op.guard()) visitor_op.end_step(step_idx);
|
||||
}
|
||||
|
||||
CUTLASS_DEVICE
|
||||
void end_epilogue() {
|
||||
if (unary_op.guard()) visitor_op.end_epilogue();
|
||||
}
|
||||
};
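
//
// Usage sketch (illustrative only): composing VisitorOpUnary with a child
// visitor and an elementwise functor. `VisitorOpAccumulator` and `ReLu` below
// are hypothetical stand-ins for whatever child node and unary op a given
// kernel provides; they are not defined in this file.
//
//   using ChildVisitor = VisitorOpAccumulator<float, float, 8>;
//   using ReluVisitor  = VisitorOpUnary<
//     float,          // ElementAccumulator
//     float,          // ElementCompute
//     8,              // kElementsPerAccess
//     ChildVisitor,   // child node producing the value to transform
//     ReLu            // UnaryOp_<T, N>
//   >;
//
// Each call to ReluVisitor::visit() forwards to the child visitor, converts
// the returned fragment to ElementCompute, and applies the unary op.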


/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,480 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
  \brief Epilogue visitor type used for the partial computation of a layernorm operation

    GemmLayernorm example = GEMM0 with partial reduction fused in epilogue (EpilogueVisitorLayerNorm)
                          + lightweight full reduction kernel (ApplyFinalReduction)
                          + GEMM1 with elementwise operations fused in mainloop (GemmLayernormMainloopFusion)
*/
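
//
// A minimal sketch of the math this visitor accumulates (assuming N columns
// per row and, for shift-K, a per-row constant K read from ptr_Shifted_K):
//
//   partial_mean = (1/N) * sum_j x_j               -> written via ptr_Mean
//   partial_sqr  = (1/N) * sum_j x_j^2             -> written via ptr_Variance
//   shifted:       (1/N) * sum_j (x_j - K)^2       -> better numerical stability
//
// ApplyFinalReduction later combines these partials using
//   var = E[x^2] - E[x]^2     (or the shifted equivalent)
// so only two scalars per row need to cross the kernel boundary.
//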

#pragma once

/////////////////////////////////////////////////////////////////////////////////////////////////

#include "cutlass/cutlass.h"
#include "cutlass/arch/memory.h"
#include "cutlass/arch/memory_sm75.h"
#include "cutlass/gemm/kernel/gemm_transpose_operands.h"
#include "cutlass/gemm/kernel/default_gemm.h"
#include "cutlass/gemm/kernel/default_gemm_complex.h"
#include "cutlass/gemm/device/default_gemm_configuration.h"
#include "cutlass/epilogue/threadblock/epilogue_with_visitor.h"

/////////////////////////////////////////////////////////////////////////////////////////////////


namespace cutlass {
namespace epilogue {
namespace threadblock {

/////////////////////////////////////////////////////////////////////////////////////////////////

template <
  typename ThreadblockShape_,
  int ThreadCount,
  typename OutputTileIterator_,
  typename AccumulatorTile_,
  typename ElementAccumulator_,
  typename ElementVariance_,
  typename ElementMean_,
  typename ElementLayernormCompute_,
  typename ElementwiseFunctor_,
  bool IsShiftedVariance_ = false
>
class EpilogueVisitorLayerNorm {
public:

  using ElementVariance = ElementVariance_;
  using ElementMean = ElementMean_;
  using ElementLayernormCompute = ElementLayernormCompute_;

  using AccumulatorTile = AccumulatorTile_;

  using ThreadblockShape = ThreadblockShape_;
  static int const kThreadCount = ThreadCount;

  using OutputTileIterator = OutputTileIterator_;
  using ElementwiseFunctor = ElementwiseFunctor_;

  static int const kIterations = OutputTileIterator::kIterations;
  static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess;
  static int const kRowIterations = OutputTileIterator::ThreadMap::Iterations::kRow;

  static int const kThreads = OutputTileIterator::ThreadMap::kThreads;

  static bool const kIsShiftedVariance = IsShiftedVariance_;

  using ElementOutput = typename OutputTileIterator::Element;

  static int const kDeltaRow = OutputTileIterator::ThreadMap::Delta::kRow;

  /// Array type used in Shift-K Layernorm
  static int const kRowAccessCount = kIterations * kRowIterations;

  using ConvertedShiftFragment = Array<ElementLayernormCompute, kRowAccessCount>;

  // Column-major output is supported by transposing the problem externally (already supported),
  // so the output layout here is always row-major
  using LayoutOutput = cutlass::layout::RowMajor;

  using ElementAccumulator = ElementAccumulator_;

  using AccumulatorFragment = Array<ElementAccumulator, kElementsPerAccess>;
  using LayernormFragment = Array<ElementLayernormCompute, kElementsPerAccess>;
  using OutputVector = Array<ElementOutput, kElementsPerAccess>;
  using TensorRefD = TensorRef<ElementOutput, LayoutOutput>;

  static int const kThreadsPerRow = OutputTileIterator::ThreadMap::Detail::kAccessWidth;
  static int const kThreadsInColumn = kThreads / kThreadsPerRow;
  static int const kHalfThreadsPerRow = (kThreadsPerRow >> 1);

  /// Argument structure
  struct Arguments {

    typename ElementwiseFunctor::Params elementwise;
    ElementVariance *ptr_Variance;
    ElementMean *ptr_Mean;
    ElementOutput *ptr_Shifted_K;
    MatrixCoord extent;

    //
    // Methods
    //
    Arguments():
      ptr_Variance(nullptr),
      ptr_Mean(nullptr),
      ptr_Shifted_K(nullptr)
    {

    }

    Arguments(
      typename ElementwiseFunctor::Params elementwise_,
      ElementVariance *ptr_Variance_,
      ElementMean *ptr_Mean_,
      ElementOutput *ptr_Shifted_K_ = nullptr,
      MatrixCoord extent_ = MatrixCoord(0, 0)
    ):
      elementwise(elementwise_),
      ptr_Variance(ptr_Variance_),
      ptr_Mean(ptr_Mean_),
      ptr_Shifted_K(ptr_Shifted_K_),
      extent(extent_)
    {

    }
  };

  struct Params {

    typename ElementwiseFunctor::Params elementwise;
    ElementVariance *ptr_Variance;
    ElementMean *ptr_Mean;
    ElementOutput *ptr_Shifted_K;
    MatrixCoord extent;

    //
    // Methods
    //
    CUTLASS_HOST_DEVICE
    Params():
      ptr_Variance(nullptr),
      ptr_Mean(nullptr),
      ptr_Shifted_K(nullptr)
    {

    }

    CUTLASS_HOST_DEVICE
    Params(Arguments const &args):
      elementwise(args.elementwise),
      ptr_Variance(args.ptr_Variance),
      ptr_Mean(args.ptr_Mean),
      ptr_Shifted_K(args.ptr_Shifted_K),
      extent(args.extent)
    {

    }
  };

  /// Shared storage
  struct SharedStorage {

  };

private:

  Params const & params_;
  SharedStorage & shared_storage_;
  MatrixCoord extent_;
  ElementwiseFunctor elementwise_;

  OutputTileIterator iterator_C_;
  OutputTileIterator iterator_D_;
  typename OutputTileIterator::Fragment fragment_C_;
  typename OutputTileIterator::Fragment fragment_D_;

  ElementAccumulator alpha_;
  ElementAccumulator beta_;
  ConvertedShiftFragment shift_k_frag_;

  ElementLayernormCompute accum_sum_square_;
  ElementLayernormCompute accum_sum_element_;
  int thread_idx_;

  MatrixCoord thread_offset_;

  gemm::GemmCoord threadblock_tile_offset_;

public:

  CUTLASS_DEVICE
  EpilogueVisitorLayerNorm(
    Params const &params,                       ///< Parameters routed to the epilogue
    SharedStorage &shared_storage,              ///< Shared storage needed by the functors here
    MatrixCoord threadblock_offset,
    gemm::GemmCoord threadblock_tile_offset,
    int thread_idx,
    OutputTileIterator destination_iterator,    ///< Tile iterator for destination
    OutputTileIterator source_iterator          ///< Tile iterator for source
  ):
    params_(params),
    shared_storage_(shared_storage),
    elementwise_(params.elementwise),
    extent_(params.extent),
    iterator_C_(source_iterator),
    iterator_D_(destination_iterator),
    threadblock_tile_offset_(threadblock_tile_offset),
    thread_idx_(thread_idx)
  {
    alpha_ = (params.elementwise.alpha_ptr ? *params.elementwise.alpha_ptr : params.elementwise.alpha);
    beta_ = (params.elementwise.beta_ptr ? *params.elementwise.beta_ptr : params.elementwise.beta);

    if (beta_ == ElementAccumulator()) {
      iterator_C_.clear_mask();
    }
  }

  /// Helper to indicate split-K behavior
  CUTLASS_DEVICE
  void set_k_partition(
    int split_k_index,      ///< Index of this threadblock within split-K partitioned scheme
    int split_k_slices) {   ///< Total number of split-K slices

  }

  /// Called to set the batch index
  CUTLASS_DEVICE
  void set_batch_index(int batch_idx) {

  }

  /// Called at the start of the epilogue just before iterating over accumulator slices
  CUTLASS_DEVICE
  void begin_epilogue() {

    // If the shift-K feature is enabled, load the shift-K fragment
    // at the very beginning of the epilogue
    if (kIsShiftedVariance && params_.ptr_Shifted_K != nullptr) {
      shift_k_frag_.clear();
      int thread_offset_row_base = iterator_D_.thread_start_row();

      CUTLASS_PRAGMA_UNROLL
      for (int iter_idx = 0; iter_idx < kIterations; ++iter_idx) {
        int step_offset = iter_idx * OutputTileIterator::Shape::kRow;
        CUTLASS_PRAGMA_UNROLL
        for (int rid = 0; rid < kRowIterations; ++rid) {
          int row_step_offset = rid * kDeltaRow;
          int row_offset = thread_offset_row_base + step_offset + row_step_offset;
          bool is_load = (row_offset < extent_.row());
          shift_k_frag_[iter_idx * kRowIterations + rid] = load_shift_k_(row_offset, is_load);
        }
      }
    }
  }

  /// Called at the start of one step before starting accumulator exchange
  CUTLASS_DEVICE
  void begin_step(int step_idx) {
    fragment_D_.clear();

    if (elementwise_.kScale != cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
      fragment_C_.clear();
      iterator_C_.load(fragment_C_);
      ++iterator_C_;
    }
  }

  /// Called at the start of a row
  CUTLASS_DEVICE
  void begin_row(int row_idx) {
    // Clear the partial-sum accumulators for this row
    accum_sum_element_ = ElementLayernormCompute(0);
    accum_sum_square_ = ElementLayernormCompute(0);
  }

  /// Called after accumulators have been exchanged for each accumulator vector
  CUTLASS_DEVICE
  void visit(
    int iter_idx,
    int row_idx,
    int column_idx,
    int frag_idx,
    AccumulatorFragment const &accum) {

    LayernormFragment result;

    thread_offset_ =
      iterator_D_.thread_start() +
      OutputTileIterator::ThreadMap::iteration_offset(frag_idx);

    NumericArrayConverter<ElementLayernormCompute, ElementOutput, kElementsPerAccess> source_converter;
    OutputVector &source_vector = reinterpret_cast<OutputVector *>(&fragment_C_)[frag_idx];

    bool column_guard = (thread_offset_.column() < extent_.column());

    if (elementwise_.kScale == cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling) {
      result = source_converter(elementwise_(accum));
    } else {
      result = source_converter(elementwise_(accum, source_vector));
    }

    ElementLayernormCompute inv_scalar = cutlass::constants::one<ElementLayernormCompute>() / ElementLayernormCompute(extent_.column());

    // The fragment is cleared for non-reachable columns, so there is no need to check against the column guard
    ElementLayernormCompute accum_sum_element_tmp = element_sum_accumulator_(result);

    // The square sum is different: with shift-K enabled, a cleared (zero) element in a
    // non-reachable column would contribute (0 - K)^2 = K^2, so those columns must be excluded.
    ElementLayernormCompute accum_sum_square_tmp = ElementLayernormCompute(0);

    if (column_guard) {
      accum_sum_square_tmp = (kIsShiftedVariance) ?
        square_sum_accumulator_(result, shift_k_frag_[iter_idx * kRowIterations + row_idx]) :
        square_sum_accumulator_(result);
    }

    accum_sum_element_tmp *= inv_scalar;
    accum_sum_square_tmp *= inv_scalar;

    // After performing the in-thread reduction, we then perform the cross-thread / in-warp reduction
    CUTLASS_PRAGMA_UNROLL
    for (int i = kHalfThreadsPerRow; i > 0; i >>= 1) {
      accum_sum_element_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_element_tmp, i);
      accum_sum_square_tmp += __shfl_xor_sync(0xFFFFFFFF, accum_sum_square_tmp, i);
    }
    accum_sum_element_ += accum_sum_element_tmp;
    accum_sum_square_ += accum_sum_square_tmp;

    // Convert to the output
    NumericArrayConverter<ElementOutput, ElementLayernormCompute, kElementsPerAccess> output_converter;
    OutputVector &output = reinterpret_cast<OutputVector *>(&fragment_D_)[frag_idx];
    output = output_converter(result);
  }
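
  //
  // Note on the warp reduction in visit() above (a sketch, assuming
  // kThreadsPerRow = 4): the XOR-butterfly exchange halves the stride each
  // step, so every thread in the row group converges to the same partial sum
  // without using shared memory:
  //
  //   step i=2: t0<->t2, t1<->t3   (each thread now holds a sum of 2 lanes)
  //   step i=1: t0<->t1, t2<->t3   (each thread now holds the sum of all 4 lanes)
  //
  // __shfl_xor_sync with a full mask is safe here because all 32 lanes of the
  // warp execute the loop together.
  //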

  /// Called at the end of a row
  CUTLASS_DEVICE
  void end_row(int row_idx) {

    using ConvertVarianceOutput = cutlass::NumericConverter<ElementVariance, ElementLayernormCompute>;
    using ConvertMeanOutput = cutlass::NumericConverter<ElementMean, ElementLayernormCompute>;

    ConvertVarianceOutput convert_variance_output;
    ConvertMeanOutput convert_mean_output;

    bool is_write_thread = (thread_offset_.row() < extent_.row() && (threadIdx.x % kThreadsPerRow) == 0);
    int row_offset = thread_offset_.row() + threadblock_tile_offset_.n() * extent_.row();

    ElementVariance *curr_ptr_sum_square = params_.ptr_Variance + row_offset;
    ElementMean *curr_ptr_element_sum = params_.ptr_Mean + row_offset;

    arch::global_store<ElementVariance, sizeof(ElementVariance)>(
      convert_variance_output(accum_sum_square_),
      (void *)curr_ptr_sum_square,
      is_write_thread);

    arch::global_store<ElementMean, sizeof(ElementMean)>(
      convert_mean_output(accum_sum_element_),
      (void *)curr_ptr_element_sum,
      is_write_thread);
  }

  /// Called after all accumulator elements have been visited
  CUTLASS_DEVICE
  void end_step(int step_idx) {

    iterator_D_.store(fragment_D_);
    ++iterator_D_;
  }

  /// Called after all steps have been completed
  CUTLASS_DEVICE
  void end_epilogue() {

  }

private:

  CUTLASS_DEVICE
  ElementLayernormCompute load_shift_k_(int row_offset, bool is_load) {
    using ConvertShiftK = cutlass::NumericConverter<ElementLayernormCompute, ElementOutput>;
    ConvertShiftK convert_shift_k;
    ElementOutput shift_k_val;

    // Computes the address from which to load the shift_k element
    ElementOutput *curr_ptr_shift_k = params_.ptr_Shifted_K + row_offset;
    // Conditionally loads from global memory
    arch::global_load<ElementOutput, sizeof(ElementOutput)>(shift_k_val, (void *)curr_ptr_shift_k, is_load);
    // Converts the data type before returning
    ElementLayernormCompute converted_shift_k_val = convert_shift_k(shift_k_val);

    return converted_shift_k_val;
  }

  CUTLASS_DEVICE
  ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum) {
    ElementLayernormCompute sum_ = ElementLayernormCompute(0);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < LayernormFragment::kElements; ++i) {
      auto accum_ = accum[i];
      sum_ += accum_ * accum_;
    }

    return sum_;
  }

  CUTLASS_DEVICE
  ElementLayernormCompute square_sum_accumulator_(LayernormFragment const &accum, ElementLayernormCompute shift_k_val) {
    ElementLayernormCompute sum_ = ElementLayernormCompute(0);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < LayernormFragment::kElements; ++i) {
      auto accum_ = accum[i] - shift_k_val;
      sum_ += accum_ * accum_;
    }

    return sum_;
  }

  CUTLASS_DEVICE
  ElementLayernormCompute element_sum_accumulator_(LayernormFragment const &accum) {
    ElementLayernormCompute sum_ = ElementLayernormCompute(0);

    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < LayernormFragment::kElements; ++i) {
      sum_ += accum[i];
    }

    return sum_;
  }

};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace threadblock
} // namespace epilogue
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,77 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Bind GEMM-related enum types to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/gemm/gemm.h"
#include "host.h"

namespace py = pybind11;

void bind_gemm(py::module &m) {
  //
  // Enumerated types
  // cutlass/gemm/gemm.h
  //

  py::enum_<cutlass::gemm::GemmUniversalMode>(m, "Mode")
    .value("Gemm", cutlass::gemm::GemmUniversalMode::kGemm, "Ordinary GEMM & GEMM Split-K serial")
    .value("GemmSplitKParallel", cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, "GEMM Split-K parallel")
    .value("Batched", cutlass::gemm::GemmUniversalMode::kBatched, "Batched GEMM")
    .value("Array", cutlass::gemm::GemmUniversalMode::kArray)
    .value("Invalid", cutlass::gemm::GemmUniversalMode::kInvalid);

  /// GemmCoord is a structure that specifies a location within the coordinate space of a GEMM problem
  py::class_<cutlass::gemm::GemmCoord>(m, "GemmCoord")
    .def(py::init<int, int, int>())
    .def("m", py::overload_cast<>(&cutlass::gemm::GemmCoord::m))
    .def("n", py::overload_cast<>(&cutlass::gemm::GemmCoord::n))
    .def("k", py::overload_cast<>(&cutlass::gemm::GemmCoord::k))
    // get tensor coords
    .def("mk",
      [](const cutlass::gemm::GemmCoord & problem_size) {
        return cutlass::MatrixCoord(problem_size.mk());
      })
    .def("kn",
      [](const cutlass::gemm::GemmCoord & problem_size) {
        return cutlass::MatrixCoord(problem_size.kn());
      })
    .def("mn",
      [](const cutlass::gemm::GemmCoord & problem_size) {
        return cutlass::MatrixCoord(problem_size.mn());
      });

  py::module_ host_submodule = m.def_submodule("host");
  bind_gemm_host_helper(host_submodule);
}
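
// A minimal sketch of how this binding function might be wired into a
// pybind11 extension module. The module name `cutlass_bindings` and the
// submodule name `gemm` are assumptions for illustration; only `bind_gemm`
// itself is defined in this file.
//
//   PYBIND11_MODULE(cutlass_bindings, m) {
//     py::module_ gemm_submodule = m.def_submodule("gemm");
//     bind_gemm(gemm_submodule);
//   }
//
// From Python, the bound types would then be reachable as, e.g.,
// `cutlass_bindings.gemm.Mode.Gemm` and `cutlass_bindings.gemm.GemmCoord(128, 128, 64)`.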
@ -1,628 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

/*! \file
  \brief GemmUniversal kernel variant whose epilogue is driven by an epilogue visitor
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/fast_math.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/kernel/params_universal_base.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/complex.h"
#include "cutlass/semaphore.h"

#include "cutlass/layout/matrix.h"

#include "cutlass/trace.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace gemm {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

template <
  typename Mma_,                 ///! Threadblock-scoped matrix multiply-accumulate
  typename Epilogue_,            ///! Epilogue
  typename ThreadblockSwizzle_   ///! Threadblock swizzling function
>
struct GemmUniversalwithEpilogueVisitor {
public:

  using Mma = Mma_;
  using Epilogue = Epilogue_;
  using EpilogueVisitor = typename Epilogue::Visitor;
  using ThreadblockSwizzle = ThreadblockSwizzle_;

  using ElementA = typename Mma::IteratorA::Element;
  using LayoutA = typename Mma::IteratorA::Layout;
  using ElementB = typename Mma::IteratorB::Element;
  using LayoutB = typename Mma::IteratorB::Layout;
  using ElementC = typename EpilogueVisitor::ElementOutput;
  using LayoutC = typename EpilogueVisitor::OutputTileIterator::Layout;

  static ComplexTransform const kTransformA = Mma::kTransformA;
  static ComplexTransform const kTransformB = Mma::kTransformB;
  using Operator = typename Mma::Operator;

  using OperatorClass = typename Mma::Operator::OperatorClass;
  using ThreadblockShape = typename Mma::Shape;
  using WarpShape = typename Mma::Operator::Shape;
  using InstructionShape = typename Mma::Policy::Operator::InstructionShape;
  using ArchTag = typename Mma::ArchTag;

  static int const kStages = Mma::kStages;
  static int const kAlignmentA = Mma::IteratorA::AccessType::kElements;
  static int const kAlignmentB = Mma::IteratorB::AccessType::kElements;
  static int const kAlignmentC = EpilogueVisitor::kElementsPerAccess;

  /// Warp count (concept: GemmShape)
  using WarpCount = typename Mma::WarpCount;
  static int const kThreadCount = 32 * WarpCount::kCount;

  /// Split-K preserves splits that are 128b aligned
  static int const kSplitKAlignment = const_max(
    128 / sizeof_bits<ElementA>::value,
    128 / sizeof_bits<ElementB>::value
  );
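
  // Worked example (illustrative, not part of the original source): with
  // ElementA = ElementB = half_t (16 bits), kSplitKAlignment =
  // max(128 / 16, 128 / 16) = 8 elements, so split-K slice boundaries fall
  // on 8-element (128-bit) boundaries along the K dimension.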

  //
  // Structures
  //

  /// Argument structure
  struct Arguments : UniversalArgumentsBase {

    //
    // Data members
    //

    typename EpilogueVisitor::Arguments epilogue_visitor;

    void const * ptr_A;
    void const * ptr_B;
    void const * ptr_C;
    void * ptr_D;

    int64_t batch_stride_A;
    int64_t batch_stride_B;
    int64_t batch_stride_C;

    typename LayoutA::Stride stride_a;
    typename LayoutB::Stride stride_b;
    typename LayoutC::Stride stride_c;
    typename LayoutC::Stride stride_d;

    typename LayoutA::Stride::LongIndex lda;
    typename LayoutB::Stride::LongIndex ldb;
    typename LayoutC::Stride::LongIndex ldc;
    typename LayoutC::Stride::LongIndex ldd;

    int const * ptr_gather_A_indices;
    int const * ptr_gather_B_indices;
    int const * ptr_scatter_D_indices;

    //
    // Methods
    //

    Arguments():
      ptr_A(nullptr), ptr_B(nullptr), ptr_C(nullptr), ptr_D(nullptr),
      batch_stride_A(0), batch_stride_B(0), batch_stride_C(0),
      lda(0), ldb(0), ldc(0), ldd(0),
      ptr_gather_A_indices(nullptr),
      ptr_gather_B_indices(nullptr),
      ptr_scatter_D_indices(nullptr) {}

    /// Constructs an Arguments structure from strides
    Arguments(
      GemmUniversalMode mode,
      GemmCoord problem_size,
      int batch_count,
      typename EpilogueVisitor::Arguments epilogue_visitor,
      void const * ptr_A,
      void const * ptr_B,
      void const * ptr_C,
      void * ptr_D,
      int64_t batch_stride_A,
      int64_t batch_stride_B,
      int64_t batch_stride_C,
      int64_t batch_stride_D,
      typename LayoutA::Stride stride_a,
      typename LayoutB::Stride stride_b,
      typename LayoutC::Stride stride_c,
      typename LayoutC::Stride stride_d,
      int const *ptr_gather_A_indices = nullptr,
      int const *ptr_gather_B_indices = nullptr,
      int const *ptr_scatter_D_indices = nullptr
    ):
      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
      epilogue_visitor(epilogue_visitor),
      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
      stride_a(stride_a), stride_b(stride_b), stride_c(stride_c), stride_d(stride_d),
      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
      ptr_scatter_D_indices(ptr_scatter_D_indices) {
      lda = 0;
      ldb = 0;
      ldc = 0;
      ldd = 0;
      CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::Arguments::Arguments() - problem_size: " << problem_size);
    }

    /// Constructs an Arguments structure from leading dimensions
    Arguments(
      GemmUniversalMode mode,
      GemmCoord problem_size,
      int batch_count,
      typename EpilogueVisitor::Arguments epilogue_visitor,
      void const * ptr_A,
      void const * ptr_B,
      void const * ptr_C,
      void * ptr_D,
      int64_t batch_stride_A,
      int64_t batch_stride_B,
      int64_t batch_stride_C,
      int64_t batch_stride_D,
      typename LayoutA::Stride::LongIndex lda,
      typename LayoutB::Stride::LongIndex ldb,
      typename LayoutC::Stride::LongIndex ldc,
      typename LayoutC::Stride::LongIndex ldd,
      int const *ptr_gather_A_indices = nullptr,
      int const *ptr_gather_B_indices = nullptr,
      int const *ptr_scatter_D_indices = nullptr
    ):
      UniversalArgumentsBase(mode, problem_size, batch_count, batch_stride_D),
      epilogue_visitor(epilogue_visitor),
      ptr_A(ptr_A), ptr_B(ptr_B), ptr_C(ptr_C), ptr_D(ptr_D),
      batch_stride_A(batch_stride_A), batch_stride_B(batch_stride_B), batch_stride_C(batch_stride_C),
      lda(lda), ldb(ldb), ldc(ldc), ldd(ldd),
      ptr_gather_A_indices(ptr_gather_A_indices), ptr_gather_B_indices(ptr_gather_B_indices),
      ptr_scatter_D_indices(ptr_scatter_D_indices) {
      stride_a = make_Coord(lda);
      stride_b = make_Coord(ldb);
      stride_c = make_Coord(ldc);
      stride_d = make_Coord(ldd);
      CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::Arguments::Arguments() - problem_size: " << problem_size);
    }

    /// Returns arguments for the transposed problem
    Arguments transposed_problem() const {
      Arguments args(*this);

      std::swap(args.problem_size.m(), args.problem_size.n());
      std::swap(args.ptr_A, args.ptr_B);
      std::swap(args.lda, args.ldb);
      std::swap(args.stride_a, args.stride_b);
      std::swap(args.batch_stride_A, args.batch_stride_B);
      std::swap(args.ptr_gather_A_indices, args.ptr_gather_B_indices);

      return args;
    }
  };

  //
  // Structure for precomputing values in host memory and passing to kernels
  //

  /// Parameters structure
  struct Params : UniversalParamsBase<
    ThreadblockSwizzle,
    ThreadblockShape,
    ElementA,
    ElementB,
    ElementC> {

    using ParamsBase = UniversalParamsBase<
      ThreadblockSwizzle,
      ThreadblockShape,
      ElementA,
      ElementB,
      ElementC>;

    typename Mma::IteratorA::Params params_A;
    typename Mma::IteratorB::Params params_B;
    typename EpilogueVisitor::OutputTileIterator::Params params_C;
    typename EpilogueVisitor::OutputTileIterator::Params params_D;

    typename EpilogueVisitor::Params epilogue_visitor;

    void * ptr_A;
    void * ptr_B;
    void * ptr_C;
    void * ptr_D;

    int64_t batch_stride_A;
    int64_t batch_stride_B;
    int64_t batch_stride_C;

    int * ptr_gather_A_indices;
    int * ptr_gather_B_indices;
    int * ptr_scatter_D_indices;

    int *semaphore;

    //
    // Methods
    //

    /// Default constructor
    Params() = default;

    CUTLASS_HOST_DEVICE
    Params(
      Arguments const &args,
      int device_sms,
      int sm_occupancy
    ):
      ParamsBase(args, device_sms, sm_occupancy),
      params_A(args.lda ? make_Coord_with_padding<LayoutA::kStrideRank>(args.lda) : args.stride_a),
      params_B(args.ldb ? make_Coord_with_padding<LayoutB::kStrideRank>(args.ldb) : args.stride_b),
      params_C(args.ldc ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldc) : args.stride_c),
      params_D(args.ldd ? make_Coord_with_padding<LayoutC::kStrideRank>(args.ldd) : args.stride_d),
      epilogue_visitor(args.epilogue_visitor),
      ptr_A(const_cast<void *>(args.ptr_A)),
      ptr_B(const_cast<void *>(args.ptr_B)),
      ptr_C(const_cast<void *>(args.ptr_C)),
      ptr_D(args.ptr_D),
      batch_stride_A(args.batch_stride_A),
      batch_stride_B(args.batch_stride_B),
      batch_stride_C(args.batch_stride_C),
      ptr_gather_A_indices(const_cast<int *>(args.ptr_gather_A_indices)),
      ptr_gather_B_indices(const_cast<int *>(args.ptr_gather_B_indices)),
      ptr_scatter_D_indices(const_cast<int *>(args.ptr_scatter_D_indices)) {

    }

    CUTLASS_HOST_DEVICE
    void update(
      Arguments const &args,
      void *workspace = nullptr) {

      ptr_A = const_cast<void *>(args.ptr_A);
      ptr_B = const_cast<void *>(args.ptr_B);
      ptr_C = const_cast<void *>(args.ptr_C);
      ptr_D = args.ptr_D;

      ptr_gather_A_indices = const_cast<int *>(args.ptr_gather_A_indices);
      ptr_gather_B_indices = const_cast<int *>(args.ptr_gather_B_indices);
      ptr_scatter_D_indices = const_cast<int *>(args.ptr_scatter_D_indices);

      batch_stride_A = args.batch_stride_A;
      batch_stride_B = args.batch_stride_B;
      batch_stride_C = args.batch_stride_C;

      epilogue_visitor = args.epilogue_visitor;

      semaphore = static_cast<int *>(workspace);
      CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::Params::update()");
    }
  };

  /// Shared memory storage structure
  union SharedStorage {
    typename Mma::SharedStorage main_loop;
    typename Epilogue::SharedStorage epilogue;
    typename EpilogueVisitor::SharedStorage visitor;
  };

public:

  //
  // Methods
  //

  CUTLASS_DEVICE
  GemmUniversalwithEpilogueVisitor() { }

  /// Determines whether the kernel satisfies alignment requirements
  static Status can_implement(
    cutlass::gemm::GemmCoord const & problem_size) {

    CUTLASS_TRACE_HOST("GemmUniversalwithEpilogueVisitor::can_implement()");

    static int const kAlignmentA = (platform::is_same<LayoutA,
                                                      layout::ColumnMajorInterleaved<32>>::value)
                                   ? 32
                                   : (platform::is_same<LayoutA,
                                                        layout::ColumnMajorInterleaved<64>>::value)
                                     ? 64
                                     : Mma::IteratorA::AccessType::kElements;
    static int const kAlignmentB = (platform::is_same<LayoutB,
                                                      layout::RowMajorInterleaved<32>>::value)
                                   ? 32
                                   : (platform::is_same<LayoutB,
                                                        layout::RowMajorInterleaved<64>>::value)
                                     ? 64
                                     : Mma::IteratorB::AccessType::kElements;
    static int const kAlignmentC = (platform::is_same<LayoutC,
                                                      layout::ColumnMajorInterleaved<32>>::value)
                                   ? 32
                                   : (platform::is_same<LayoutC,
                                                        layout::ColumnMajorInterleaved<64>>::value)
                                     ? 64
                                     : Epilogue::OutputTileIterator::kElementsPerAccess;

    bool isAMisaligned = false;
    bool isBMisaligned = false;
    bool isCMisaligned = false;

    if (platform::is_same<LayoutA, layout::RowMajor>::value) {
      isAMisaligned = problem_size.k() % kAlignmentA;
    } else if (platform::is_same<LayoutA, layout::ColumnMajor>::value) {
      isAMisaligned = problem_size.m() % kAlignmentA;
    } else if (platform::is_same<LayoutA, layout::ColumnMajorInterleaved<32>>::value
            || platform::is_same<LayoutA, layout::ColumnMajorInterleaved<64>>::value) {
      isAMisaligned = problem_size.k() % kAlignmentA;
    }

    if (platform::is_same<LayoutB, layout::RowMajor>::value) {
      isBMisaligned = problem_size.n() % kAlignmentB;
    } else if (platform::is_same<LayoutB, layout::ColumnMajor>::value) {
      isBMisaligned = problem_size.k() % kAlignmentB;
    } else if (platform::is_same<LayoutB, layout::RowMajorInterleaved<32>>::value
            || platform::is_same<LayoutB, layout::RowMajorInterleaved<64>>::value) {
      isBMisaligned = problem_size.k() % kAlignmentB;
    }

    if (platform::is_same<LayoutC, layout::RowMajor>::value) {
      isCMisaligned = problem_size.n() % kAlignmentC;
    } else if (platform::is_same<LayoutC, layout::ColumnMajor>::value) {
      isCMisaligned = problem_size.m() % kAlignmentC;
    } else if (platform::is_same<LayoutC, layout::ColumnMajorInterleaved<32>>::value
            || platform::is_same<LayoutC, layout::ColumnMajorInterleaved<64>>::value) {
      isCMisaligned = problem_size.n() % kAlignmentC;
    }

    if (isAMisaligned) {
      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for A operand");
      return Status::kErrorMisalignedOperand;
    }

    if (isBMisaligned) {
      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for B operand");
      return Status::kErrorMisalignedOperand;
    }

    if (isCMisaligned) {
      CUTLASS_TRACE_HOST("  returning kErrorMisalignedOperand for C operand");
      return Status::kErrorMisalignedOperand;
    }

    CUTLASS_TRACE_HOST("  returning kSuccess");

    return Status::kSuccess;
  }
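
  // Worked example (illustrative, assuming half_t operands with 8-element
  // access): a RowMajor A operand requires problem_size.k() % 8 == 0, so
  // GemmCoord{128, 128, 60} would fail with kErrorMisalignedOperand while
  // GemmCoord{128, 128, 64} passes.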

  static Status can_implement(Arguments const &args) {
    return can_implement(args.problem_size);
  }

  /// Executes one GEMM
  CUTLASS_DEVICE
  void operator()(Params const &params, SharedStorage &shared_storage) {

    // Compute threadblock location
    ThreadblockSwizzle threadblock_swizzle;

    cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

    // Early exit if CTA is out of range
    if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() ||
        params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) {

      return;
    }

    int offset_k = 0;
    int problem_size_k = params.problem_size.k();

    ElementA *ptr_A = static_cast<ElementA *>(params.ptr_A);
    ElementB *ptr_B = static_cast<ElementB *>(params.ptr_B);

    //
    // Fetch pointers based on mode.
    //
    if (params.mode == GemmUniversalMode::kGemm ||
        params.mode == GemmUniversalMode::kGemmSplitKParallel) {

      if (threadblock_tile_offset.k() + 1 < params.grid_tiled_shape.k()) {

        problem_size_k = (threadblock_tile_offset.k() + 1) * params.gemm_k_size;
      }

      offset_k = threadblock_tile_offset.k() * params.gemm_k_size;
    }
    else if (params.mode == GemmUniversalMode::kBatched) {
      ptr_A += threadblock_tile_offset.k() * params.batch_stride_A;
      ptr_B += threadblock_tile_offset.k() * params.batch_stride_B;
    }
    else if (params.mode == GemmUniversalMode::kArray) {
      ptr_A = static_cast<ElementA * const *>(params.ptr_A)[threadblock_tile_offset.k()];
      ptr_B = static_cast<ElementB * const *>(params.ptr_B)[threadblock_tile_offset.k()];
    }

    __syncthreads();

    // Compute initial location in logical coordinates
    cutlass::MatrixCoord tb_offset_A{
      threadblock_tile_offset.m() * Mma::Shape::kM,
      offset_k,
    };

    cutlass::MatrixCoord tb_offset_B{
      offset_k,
      threadblock_tile_offset.n() * Mma::Shape::kN
    };

    // Compute position within threadblock
    int thread_idx = threadIdx.x;

    // Construct iterators to A and B operands
    typename Mma::IteratorA iterator_A(
      params.params_A,
      ptr_A,
      {params.problem_size.m(), problem_size_k},
      thread_idx,
      tb_offset_A,
      params.ptr_gather_A_indices);

    typename Mma::IteratorB iterator_B(
      params.params_B,
      ptr_B,
      {problem_size_k, params.problem_size.n()},
      thread_idx,
      tb_offset_B,
      params.ptr_gather_B_indices);

    // Broadcast the warp_id computed by lane 0 to ensure dependent code
    // is compiled as warp-uniform.
    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);

    int lane_idx = threadIdx.x % 32;

    //
    // Main loop
    //

    // Construct thread-scoped matrix multiply
    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);

    typename Mma::FragmentC accumulators;

    accumulators.clear();

    // Compute the number of threadblock-scoped matrix multiply-add iterations
    int gemm_k_iterations = (problem_size_k - offset_k + Mma::Shape::kK - 1) / Mma::Shape::kK;

    // Compute the threadblock-scoped matrix multiply-add
    mma(
      gemm_k_iterations,
      accumulators,
      iterator_A,
      iterator_B,
      accumulators);

    //
    // Epilogue
    //

    //
    // Masked tile iterators constructed from members
    //

    threadblock_tile_offset = threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);

    // assume identity swizzle
    MatrixCoord threadblock_offset(
      threadblock_tile_offset.m() * Mma::Shape::kM,
      threadblock_tile_offset.n() * Mma::Shape::kN
    );

    int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m();

    ElementC *ptr_C = static_cast<ElementC *>(params.ptr_C);
    ElementC *ptr_D = static_cast<ElementC *>(params.ptr_D);

    //
    // Fetch pointers based on mode.
    //

    // Construct the semaphore.
    Semaphore semaphore(params.semaphore + block_idx, thread_idx);

    // Construct the epilogue visitor
    EpilogueVisitor epilogue_visitor(
      params.epilogue_visitor,
      shared_storage.visitor,
      threadblock_offset,
      threadblock_tile_offset,
      thread_idx,
      params.problem_size.mn()
    );

    if (params.mode == GemmUniversalMode::kBatched || params.mode == GemmUniversalMode::kArray) {
      epilogue_visitor.set_batch_index(threadblock_tile_offset.k());
    }

    Epilogue epilogue(
      shared_storage.epilogue,
      thread_idx,
      warp_idx,
      lane_idx);

    // Wait on the semaphore - this latency may have been covered by iterator construction
    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {

      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
      semaphore.wait(threadblock_tile_offset.k());
    }


    // Execute the epilogue operator to update the destination tensor.
    epilogue(epilogue_visitor, accumulators);

    //
    // Release the semaphore
    //

    if (params.mode == GemmUniversalMode::kGemm && params.grid_tiled_shape.k() > 1) {

      int lock = 0;
      if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) {

        // The final threadblock resets the semaphore for subsequent grids.
        lock = 0;
      }
      else {
        // Otherwise, the semaphore is incremented
        lock = threadblock_tile_offset.k() + 1;
      }

      semaphore.release(lock);
    }
  }
};
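
//
// A sketch of the serial split-K handshake implemented above (illustrative):
// with grid_tiled_shape.k() == 3, the threadblocks covering one output tile
// execute in sequence
//
//   k=0: wait(0) -> epilogue reads C, writes D -> release(1)
//   k=1: wait(1) -> epilogue reads D, writes D -> release(2)
//   k=2: wait(2) -> epilogue reads D, writes D -> release(0)  // reset for reuse
//
// so partial accumulations are folded into D one K-slice at a time, and the
// final slice resets the lock for subsequent launches.
//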

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace gemm
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
@ -1,47 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Bind GEMM host helpers to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/util/host_reorder.h"
#include "cutlass/layout/tensor.h"

namespace py = pybind11;


void bind_gemm_host_helper(py::module &m) {
  m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::RowMajorInterleaved<32>>);
  m.def("reorder_column", &cutlass::reorder_column<32, int8_t, cutlass::layout::ColumnMajorInterleaved<32>>);
}
@ -1,47 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Bind CUTLASS layouts to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "tensor.h"
#include "matrix.h"


namespace py = pybind11;

void bind_layout(py::module &m) {
  bind_tensor_layout(m);
  bind_matrix_layout(m);
}
@ -1,87 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
  \brief Bind matrix layouts to Python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/layout/matrix.h"

namespace py = pybind11;

void bind_matrix_layout(py::module &m) {
  //
  // Matrix layouts
  // cutlass/layout/matrix.h
  //

  py::class_<cutlass::layout::RowMajor>(m, "RowMajor", R"pbdoc(
    Mapping function for row-major matrices.
  )pbdoc")
    .def_static("packed", &cutlass::layout::RowMajor::packed,
      py::arg("extent"),
      R"pbdoc(Helper that returns a layout for a tightly packed tensor)pbdoc")
    .def("stride", [](const cutlass::layout::RowMajor & layout){
      return layout.stride().at(0);
    }, R"pbdoc(Returns the stride of the layout)pbdoc");

  py::class_<cutlass::layout::ColumnMajor>(m, "ColumnMajor", R"pbdoc(
    Mapping function for column-major matrices.
  )pbdoc")
    .def_static("packed", &cutlass::layout::ColumnMajor::packed,
      py::arg("extent"),
      R"pbdoc(Helper that returns a layout for a tightly packed tensor)pbdoc")
    .def("stride", [](const cutlass::layout::ColumnMajor & layout){
      return layout.stride().at(0);
    }, R"pbdoc(Returns the stride of the layout)pbdoc");

  py::class_<cutlass::layout::RowMajorInterleaved<32>>(m, "RowMajorInterleaved32",
    R"pbdoc(Mapping function for interleaved matrices. The matrix is structured
    as a row-major arrangement of fixed-size columns of 32 elements)pbdoc")
    .def_static("packed", &cutlass::layout::RowMajorInterleaved<32>::packed,
      py::arg("extent"),
      R"pbdoc(Helper that returns a layout for a tightly packed tensor)pbdoc")
    .def("stride", [](const cutlass::layout::RowMajorInterleaved<32> & layout){
      return layout.stride().at(0);
    }, R"pbdoc(Returns the stride of the layout)pbdoc");

  py::class_<cutlass::layout::ColumnMajorInterleaved<32>>(m, "ColumnMajorInterleaved32",
    R"pbdoc(Mapping function for interleaved matrices. The matrix is structured
    as a column-major arrangement of fixed-size rows of 32 elements)pbdoc")
    .def_static("packed", &cutlass::layout::ColumnMajorInterleaved<32>::packed,
      py::arg("extent"),
      R"pbdoc(Helper that returns a layout for a tightly packed tensor)pbdoc")
    .def("stride", [](const cutlass::layout::ColumnMajorInterleaved<32> & layout){
      return layout.stride().at(0);
    }, R"pbdoc(Returns the stride of the layout)pbdoc");
}
|
||||
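
// Illustrative sketch, not from this diff: what the packed()/stride() calls
// bound above compute on the C++ side for a 4 x 8 row-major matrix.
#include <cassert>
#include "cutlass/layout/matrix.h"

void row_major_packed_example() {
  // packed() builds the layout of a tightly packed matrix with the given extent.
  cutlass::layout::RowMajor layout =
      cutlass::layout::RowMajor::packed(cutlass::MatrixCoord(4, 8));
  // For packed row-major data the stride equals the number of columns.
  assert(layout.stride().at(0) == 8);
}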
@ -1,74 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind Tensor layouts to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/layout/tensor.h"

namespace py = pybind11;

void bind_tensor_layout(py::module &m) {
  //
  // Tensor layouts
  // cutlass/include/cutlass/layout/tensor.h
  //

  /// Mapping function for 4-D NHWC tensors.
  py::class_<cutlass::layout::TensorNHWC>(m, "TensorNHWC",
    R"pbdoc(Mapping function for 4-D NHWC tensors)pbdoc")
    .def_static("packed", &cutlass::layout::TensorNHWC::packed,
      py::arg("extent"),
      R"pbdoc(Helper returning a layout for a tightly packed NHWC tensor)pbdoc")
    .def("stride", py::overload_cast<>(&cutlass::layout::TensorNHWC::stride),
      R"pbdoc(Returns the stride of the layout)pbdoc");

  /// Mapping function for 4-D NC/xHWx tensors.
  py::class_<cutlass::layout::TensorNCxHWx<32>>(m, "TensorNC32HW32",
    R"pbdoc(Mapping function for 4-D NC/32HW32 tensors)pbdoc")
    .def_static("packed", &cutlass::layout::TensorNCxHWx<32>::packed,
      py::arg("extent"),
      R"pbdoc(Helper returning a layout for a tightly packed tensor)pbdoc")
    .def("stride", py::overload_cast<>(&cutlass::layout::TensorNCxHWx<32>::stride),
      R"pbdoc(Returns the stride of the layout)pbdoc");

  /// Mapping function for 4-D CxRSKx tensors.
  py::class_<cutlass::layout::TensorCxRSKx<32>>(m, "TensorC32RSK32",
    R"pbdoc(Mapping function for 4-D C32RSK32 tensors)pbdoc")
    .def_static("packed", &cutlass::layout::TensorCxRSKx<32>::packed,
      py::arg("extent"),
      R"pbdoc(Helper returning a layout for a tightly packed tensor)pbdoc")
    .def("stride", py::overload_cast<>(&cutlass::layout::TensorCxRSKx<32>::stride),
      R"pbdoc(Returns the stride of the layout)pbdoc");
}
@ -1,159 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind threadblock swizzling to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"

#include <cxxabi.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <memory>
#include <string>

namespace py = pybind11;

std::string demangle(const char* mangled_name) {
  std::size_t len = 0;
  int status = 0;
  // __cxa_demangle returns a malloc()-allocated buffer, so it must be released
  // with std::free rather than the default delete.
  std::unique_ptr<char, decltype(&std::free)> ptr(
      __cxxabiv1::__cxa_demangle(mangled_name, nullptr, &len, &status), std::free);
  return ptr ? std::string(ptr.get()) : std::string(mangled_name);
}

template<typename T>
void bind_identity_swizzle(py::module & m, std::string name) {
  py::class_<T>(m, name.c_str(),
    R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc")
    .def(py::init<>())
    .def("get_tiled_shape",
      py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
        &T::get_tiled_shape, py::const_
      ), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
      R"pbdoc(Returns the shape of the problem in units of logical tiles

      :param problem_size: gemm(M, N, K)
      :type problem_size: :class:`cutlass.gemm.GemmCoord`
      )pbdoc")
    .def("get_tiled_shape",
      py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
        &T::get_tiled_shape, py::const_
      ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
      R"pbdoc(Returns the shape of the problem in units of logical tiles

      :param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
      :type problem_size: :class:`cutlass.gemm.GemmCoord`
      )pbdoc")
    .def("get_tiled_shape",
      py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv3dProblemSize&, cutlass::gemm::GemmCoord, int>(
        &T::get_tiled_shape, py::const_
      ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
      R"pbdoc(Returns the shape of the problem in units of logical tiles

      :param problem_size: Implicit gemm problem size conv_operator(NZPQK, NDHWC, KTRSC)
      :type problem_size: :class:`cutlass.gemm.GemmCoord`
      )pbdoc")
    .def("get_grid_shape", &T::get_grid_shape,
      py::arg("tiled_shape"),
      R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
    .def("tag", [](const T & swizzle){
      return demangle(typeid(T).name());
    }, R"pbdoc(Returns the C++ name of the swizzle for code emission)pbdoc");
}

template<typename T>
void bind_swizzle(py::module & m, std::string name, std::string doc) {
  py::class_<T>(m, name.c_str(), doc.c_str())
    .def(py::init<>())
    .def("get_tiled_shape",
      py::overload_cast<cutlass::gemm::GemmCoord, cutlass::gemm::GemmCoord, int>(
        &T::get_tiled_shape, py::const_
      ), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
      R"pbdoc(Returns the shape of the problem in units of logical tiles

      :param problem_size: gemm(M, N, K)
      :type problem_size: :class:`cutlass.gemm.GemmCoord`
      )pbdoc")
    .def("get_grid_shape", &T::get_grid_shape,
      py::arg("tiled_shape"),
      R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
    .def("tag", [](const T & swizzle){
      return demangle(typeid(T).name());
    }, R"pbdoc(Returns the C++ name of the swizzle for code emission)pbdoc");
}

template<typename T>
void bind_dgrad_swizzle(py::module & m, std::string name) {
  py::class_<T>(m, name.c_str(),
    R"pbdoc(Threadblock swizzling function for strided dgrad convolution)pbdoc")
    .def(py::init<>())
    .def("get_tiled_shape",
      py::overload_cast<cutlass::conv::Operator, const cutlass::conv::Conv2dProblemSize&, cutlass::gemm::GemmCoord, int>(
        &T::get_tiled_shape, py::const_
      ), py::arg("conv_operator"), py::arg("problem_size"), py::arg("tile_size"), py::arg("split_k_slices"),
      R"pbdoc(Returns the shape of the problem in units of logical tiles

      :param problem_size: Implicit gemm problem size conv_operator(NPQK, NHWC, KRSC)
      :type problem_size: :class:`cutlass.gemm.GemmCoord`
      )pbdoc")
    .def("get_grid_shape", [](const T & swizzle, cutlass::gemm::GemmCoord tiled_shape) {
      return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k());
    }, py::arg("tiled_shape"),
      R"pbdoc(Computes CUDA grid dimensions given a size in units of logical tiles)pbdoc")
    .def("tag", [](const T & swizzle){
      return demangle(typeid(T).name());
    }, R"pbdoc(Returns the C++ name of the swizzle for code emission)pbdoc");
}

void bind_threadblock_swizzle(py::module &m) {

  py::class_<dim3>(m, "dim3",
    R"pbdoc(CUDA dim3 type holding three integer components x, y, and z)pbdoc")
    .def(py::init<int, int, int>(),
      py::arg("x"), py::arg("y"), py::arg("z"))
    .def_readwrite("x", &dim3::x, R"pbdoc(x component)pbdoc")
    .def_readwrite("y", &dim3::y, R"pbdoc(y component)pbdoc")
    .def_readwrite("z", &dim3::z, R"pbdoc(z component)pbdoc");

  bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>>(m, "IdentitySwizzle1");
  bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>>(m, "IdentitySwizzle2");
  bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>>(m, "IdentitySwizzle4");
  bind_identity_swizzle<cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>>(m, "IdentitySwizzle8");

  bind_swizzle<cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle>(m, "HorizontalSwizzle", R"pbdoc(Threadblock swizzling function for GEMMs)pbdoc");
  bind_swizzle<cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle>(m, "BatchedIdentitySwizzle", R"pbdoc(Threadblock swizzling function for batched GEMMs)pbdoc");

  bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>>(m, "StridedDgradIdentitySwizzle1");
  bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>>(m, "StridedDgradIdentitySwizzle4");
  bind_dgrad_swizzle<cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle>(m, "StridedDgradHorizontalSwizzle");
}
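
// Illustrative sketch, not from this diff, assuming standard CUTLASS
// semantics: get_tiled_shape rounds each GEMM dimension up to whole
// threadblock tiles, and split-K contributes the K component of the result.
#include <cuda_runtime.h>
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

void tiled_shape_example() {
  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1> swizzle;
  cutlass::gemm::GemmCoord problem(1024, 512, 4096);   // M, N, K
  cutlass::gemm::GemmCoord tile(128, 128, 32);         // threadblock tile
  // ceil(1024 / 128) = 8 tiles in M, ceil(512 / 128) = 4 tiles in N.
  cutlass::gemm::GemmCoord tiled =
      swizzle.get_tiled_shape(problem, tile, /*split_k_slices=*/1);
  dim3 grid = swizzle.get_grid_shape(tiled);           // CUDA launch dimensions
  (void)grid;
}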
@ -1,78 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind Tensor Coord to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/tensor_coord.h"

namespace py = pybind11;

void bind_tensor_coord(py::module &m) {
  //
  // Tensor Coords
  // cutlass/include/cutlass/tensor_coord.h
  //

  /// Defines a canonical 4D coordinate used by tensor operations.
  py::class_<cutlass::Tensor4DCoord>(m, "Tensor4DCoord",
    R"pbdoc(Defines a canonical 4D coordinate used by tensor operations)pbdoc")
    .def(py::init<int, int, int, int>(),
      py::arg("n"), py::arg("h"), py::arg("w"), py::arg("c"),
      R"pbdoc(Helper to construct from N, H, W, and C)pbdoc")
    .def("at", py::overload_cast<int>(&cutlass::Tensor4DCoord::at),
      py::arg("dim"),
      R"pbdoc(Gets the coordinate in a given dimension)pbdoc")
    .def("size", [](const cutlass::Tensor4DCoord & coord) {
      return coord.at(0) * coord.at(1) * coord.at(2) * coord.at(3);
    }, R"pbdoc(Returns the number of elements spanned by the coordinate, i.e. N * H * W * C)pbdoc");

  py::class_<cutlass::Coord<3>>(m, "Tensor3DCoord",
    R"pbdoc(Defines a canonical 3D coordinate used by tensor operations)pbdoc")
    .def("at", py::overload_cast<int>(&cutlass::Coord<3>::at),
      py::arg("dim"),
      R"pbdoc(Gets the coordinate in a given dimension)pbdoc");

  // Matrix Size
  py::class_<cutlass::MatrixCoord>(m, "MatrixCoord",
    R"pbdoc(MatrixCoord wraps Coord<2, int> to provide a helper for accessing named dimensions. Classes
    expecting a coordinate in the rank=2 index space of a matrix should use MatrixCoord.)pbdoc")
    .def(py::init<int, int>(),
      py::arg("row"), py::arg("column"), R"pbdoc(Helper to construct from a row and column)pbdoc")
    .def("row", py::overload_cast<>(&cutlass::MatrixCoord::row),
      R"pbdoc(Returns the row of the coordinate)pbdoc")
    .def("column", py::overload_cast<>(&cutlass::MatrixCoord::column),
      R"pbdoc(Returns the column of the coordinate)pbdoc");
}
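
// Illustrative sketch, not from this diff: the size() helper bound above is
// simply the product of the four extents.
#include "cutlass/tensor_coord.h"

int tensor4d_size_example() {
  cutlass::Tensor4DCoord coord(2, 14, 14, 64);           // N, H, W, C
  return coord.n() * coord.h() * coord.w() * coord.c();  // 2 * 14 * 14 * 64 = 25088
}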
@ -1,102 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind TensorRef and TensorView to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/tensor_ref.h"
#include "cutlass/tensor_view.h"
#include "types.h"


template<typename T, typename L, typename TF>
void bind_tensor_ref_view(py::module &m, std::string name) {
  py::class_<cutlass::TensorRef<T, L>>(m, ("TensorRef" + name).c_str())
    .def(py::init([](int64_t address, const L& layout_) {
      // Reinterpret the integer as a pointer to elements of type T; the
      // caller owns the underlying allocation.
      T* ptr = reinterpret_cast<T*>(address);
      return cutlass::TensorRef<T, L>(ptr, layout_);
    }))
    .def("data", [](cutlass::TensorRef<T, L>& tensor_ref) {
      T* ptr = tensor_ref.data();
      return reinterpret_cast<int64_t>(ptr);
    })
    .def("layout", py::overload_cast<>(&cutlass::TensorRef<T, L>::layout));

  // The otherwise unused TF argument selects this overload by element type.
  m.def("get_tensor_ref", [](int64_t address, TF data, const L& layout_) {
    T* ptr = reinterpret_cast<T*>(address);
    cutlass::TensorRef<T, L> tensor_ref = cutlass::TensorRef<T, L>(ptr, layout_);
    return tensor_ref;
  });

  py::class_<cutlass::TensorView<T, L>>(m, ("TensorView" + name).c_str())
    .def(py::init<const cutlass::TensorRef<T, L>&, const typename L::TensorCoord &>());
}


void bind_tensor_refs_and_views(py::module &m) {

  /// float
  bind_tensor_ref_view<float, cutlass::layout::RowMajor, cutlass::float32>(m, "F32RowMajor");
  bind_tensor_ref_view<float, cutlass::layout::ColumnMajor, cutlass::float32>(m, "F32ColumnMajor");
  bind_tensor_ref_view<float, cutlass::layout::TensorNHWC, cutlass::float32>(m, "F32NHWC");

  /// double
  bind_tensor_ref_view<double, cutlass::layout::RowMajor, cutlass::float64>(m, "F64RowMajor");
  bind_tensor_ref_view<double, cutlass::layout::ColumnMajor, cutlass::float64>(m, "F64ColumnMajor");
  bind_tensor_ref_view<double, cutlass::layout::TensorNHWC, cutlass::float64>(m, "F64NHWC");

  // half_t
  bind_tensor_ref_view<cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t>(m, "F16RowMajor");
  bind_tensor_ref_view<cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t>(m, "F16ColumnMajor");
  bind_tensor_ref_view<cutlass::half_t, cutlass::layout::TensorNHWC, cutlass::half_t>(m, "F16NHWC");

  // bfloat16
  bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t>(m, "BF16RowMajor");
  bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::ColumnMajor, cutlass::bfloat16_t>(m, "BF16ColumnMajor");
  bind_tensor_ref_view<cutlass::bfloat16_t, cutlass::layout::TensorNHWC, cutlass::bfloat16_t>(m, "BF16NHWC");

  // int8_t
  bind_tensor_ref_view<int8_t, cutlass::layout::RowMajorInterleaved<32>, cutlass::int8>(m, "S8RowMajorInterleaved32");
  bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajorInterleaved<32>, cutlass::int8>(m, "S8ColumnMajorInterleaved32");
  bind_tensor_ref_view<int8_t, cutlass::layout::RowMajor, cutlass::int8>(m, "S8RowMajor");
  bind_tensor_ref_view<int8_t, cutlass::layout::ColumnMajor, cutlass::int8>(m, "S8ColumnMajor");
  bind_tensor_ref_view<int8_t, cutlass::layout::TensorNHWC, cutlass::int8>(m, "S8NHWC");
  bind_tensor_ref_view<int8_t, cutlass::layout::TensorNCxHWx<32>, cutlass::int8>(m, "S8NC32HW32");
  bind_tensor_ref_view<int8_t, cutlass::layout::TensorCxRSKx<32>, cutlass::int8>(m, "S8C32RSK32");

  // int32_t
  bind_tensor_ref_view<int32_t, cutlass::layout::RowMajor, cutlass::int32>(m, "S32RowMajor");
  bind_tensor_ref_view<int32_t, cutlass::layout::ColumnMajor, cutlass::int32>(m, "S32ColumnMajor");
  bind_tensor_ref_view<int32_t, cutlass::layout::TensorNHWC, cutlass::int32>(m, "S32NHWC");
}
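
// Illustrative sketch, not from this diff: get_tensor_ref above reconstructs
// a TensorRef from a raw integer address plus a layout, equivalent to this
// hypothetical helper.
#include <cstdint>
#include "cutlass/layout/matrix.h"
#include "cutlass/tensor_ref.h"

cutlass::TensorRef<float, cutlass::layout::RowMajor>
make_ref_from_address(std::int64_t address, std::int64_t leading_dim) {
  // The integer is assumed to hold a valid pointer to float data whose
  // lifetime is managed by the caller.
  float *ptr = reinterpret_cast<float *>(address);
  return cutlass::TensorRef<float, cutlass::layout::RowMajor>(
      ptr, cutlass::layout::RowMajor(leading_dim));
}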
@ -1,146 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind CUTLASS types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include <cstdint>
#include "cutlass/half.h"
#include "cutlass/bfloat16.h"  // cutlass::bfloat16_t, bound below
#include "cutlass/tfloat32.h"  // cutlass::tfloat32_t, bound below


namespace py = pybind11;

namespace cutlass {

/// 8-bit signed integer
struct alignas(1) int8 {
  int8_t storage;
  explicit int8(int x) {
    storage = int8_t(x);
  }
  explicit int8(float x) {
    storage = int8_t(x);
  }

  int8_t c_value() { return storage; }
};

/// 32-bit signed integer
struct alignas(4) int32 {
  int storage;
  explicit int32(int x) {
    storage = x;
  }
  explicit int32(float x) {
    storage = int(x);
  }

  int c_value() { return storage; }
};

/// IEEE single-precision floating-point type
struct alignas(4) float32 {
  float storage;
  explicit float32(float x) {
    storage = x;
  }
  explicit float32(int x) {
    storage = float(x);
  }
  float c_value() { return storage; }
};

/// IEEE double-precision floating-point type
struct alignas(8) float64 {  // double requires 8-byte alignment
  double storage;
  explicit float64(float x) {
    storage = double(x);
  }
  explicit float64(int x) {
    storage = double(x);
  }
  double c_value() { return storage; }
};

} // namespace cutlass

void bind_cutlass_types(py::module &m) {

  // s8
  py::class_<cutlass::int8>(m, "int8")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::int8::storage)
    .def("value", &cutlass::int8::c_value);

  // s32
  py::class_<cutlass::int32>(m, "int32")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::int32::storage)
    .def("value", &cutlass::int32::c_value);

  // f16
  py::class_<cutlass::half_t>(m, "float16")
    .def(py::init<float>())
    .def(py::init<double>())
    .def(py::init<int>())
    .def(py::init<unsigned>())
    .def_readwrite("storage", &cutlass::half_t::storage)
    .def("value", [](const cutlass::half_t& value) {return value;});

  // bf16
  py::class_<cutlass::bfloat16_t>(m, "bfloat16")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::bfloat16_t::storage)
    .def("value", [](const cutlass::bfloat16_t& value) {return value;});

  // f32
  py::class_<cutlass::float32>(m, "float32")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::float32::storage)
    .def("value", &cutlass::float32::c_value);

  // tf32
  py::class_<cutlass::tfloat32_t>(m, "tfloat32")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::tfloat32_t::storage)
    .def("value", [](const cutlass::tfloat32_t& value) {return value;});

  // f64
  py::class_<cutlass::float64>(m, "float64")
    .def(py::init<float>())
    .def(py::init<int>())
    .def_readwrite("storage", &cutlass::float64::storage)
    .def("value", &cutlass::float64::c_value);
}
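
// Illustrative sketch, not from this diff: the wrapper structs above simply
// ferry scalar values across the Python boundary; c_value() recovers the
// underlying C type.
void scalar_wrapper_example() {
  cutlass::float32 alpha(1.0f);  // wraps a float
  float a = alpha.c_value();     // 1.0f
  cutlass::int8 s(-3);
  int8_t v = s.c_value();        // -3
  (void)a; (void)v;
}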
@ -1,32 +0,0 @@
#include <cutlass/complex.h>

namespace cutlass {

/// Enum class for data types
enum class DataType {
  kB1, kU2, kU4, kU8,
  kU16, kU32, kU64, kS2,
  kS4, kS8, kS16, kS32,
  kS64, kF16, kBF16, kF32,
  kTF32, kF64, kCF16, kCBF16,
  kCF32, kCTF32, kCF64, kCS2,
  kCS4, kCS8, kCS16, kCS32,
  kCS64, kCU2, kCU4, kCU8,
  kCU16, kCU32, kCU64, kInvalid
};

/// Enum class for layout types
enum class LayoutType {
  kColumnMajor, kRowMajor,
  kColumnMajorInterleaved2, kRowMajorInterleaved2,
  kColumnMajorInterleaved32, kRowMajorInterleaved32,
  kColumnMajorInterleaved64, kRowMajorInterleaved64,
  kTensorNHWC, kTensorNDHWC, kTensorNCHW, kTensorNGHWC,
  kTensorNC32HW32, kTensorNC64HW64, kTensorC32RSK32,
  kTensorC64RSK64
};

/// Enum class for opcode class

} // namespace cutlass
@ -1,54 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind convolution problems to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>


#include "unit/conv/device/conv2d_problems.h"
#include "cutlass/conv/conv2d_problem_size.h"

namespace py = pybind11;

PYBIND11_MAKE_OPAQUE(std::vector<cutlass::conv::Conv2dProblemSize>);

void bind_conv_problem_size_test(py::module &m) {

  py::bind_vector<std::vector<cutlass::conv::Conv2dProblemSize>>(m, "Conv2dProblemVector")
    .def("size", &std::vector<cutlass::conv::Conv2dProblemSize>::size);
  // Get Conv2d problem sizes
  py::class_<test::conv::device::TestbedConv2dProblemSizes>(m, "TestbedConv2dProblemSizes")
    .def(py::init<int>())
    .def_readonly("conv2d_default_sizes", &test::conv::device::TestbedConv2dProblemSizes::conv2d_default_sizes);
}
@ -1,49 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind convolution-related types to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "conv_problems.h"
#include "host.h"

namespace py = pybind11;

void bind_convolution_test(py::module &m) {
  // Conv problem sizes
  bind_conv_problem_size_test(m);

  py::module_ host_submodule = m.def_submodule("host");
  bind_conv_host_references(host_submodule);
}
@ -1,180 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind Convolution host test helpers to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>
#include "unit/conv/device/cache_testbed_output.h"


#include "cutlass/util/reference/host/convolution.h"
#include "cutlass/util/reference/host/tensor_compare.h"

namespace py = pybind11;


template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host(py::module &m) {
  m.def("conv2d",
    &cutlass::reference::host::Conv2d<
      Ta, La, Tb, Lb, Tc, Lc, Te, Tacc>);

  m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}

template<typename Ta, typename La, typename Tb, typename Lb, typename Tc, typename Lc, typename Tacc, typename Te>
void bind_conv2d_host_sat(py::module &m) {
  m.def("conv2d",
    &cutlass::reference::host::Conv2d<
      Ta, La, Tb, Lb, Tc, Lc, Te, Tacc, cutlass::NumericConverterClamp<Tc, Te>>);

  m.def("CreateCachedConv2dTestKey", &test::conv::device::CreateCachedConv2dTestKey<Ta, La, Tb, Lb, Tc, Lc, Tacc, Te>);
}

template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nhwc(py::module &m) {
  bind_conv2d_host<
    Ta, cutlass::layout::TensorNHWC,
    Tb, cutlass::layout::TensorNHWC,
    Tc, cutlass::layout::TensorNHWC,
    Tacc, Te>(m);
}

template<typename Ta, typename Tb, typename Tc, typename Tacc, typename Te>
void bind_conv2d_host_nc32hw32(py::module &m) {
  bind_conv2d_host_sat<
    Ta, cutlass::layout::TensorNCxHWx<32>,
    Tb, cutlass::layout::TensorCxRSKx<32>,
    Tc, cutlass::layout::TensorNCxHWx<32>,
    Tacc, Te>(m);
}


template<typename T, typename Layout>
void bind_tensor_equals(py::module &m) {
  m.def("equals", py::overload_cast<
    const cutlass::TensorView<T, Layout>&, const cutlass::TensorView<T, Layout>&>(
      &cutlass::reference::host::TensorEquals<T, Layout>
  ));
}

#define BIND_TENSOR_HASH(Element, Layout) { \
  m.def("TensorHash", &test::conv::device::TensorHash<Element, Layout>, py::arg("view"), py::arg("hash") = test::conv::device::CRC32(), py::arg("crc") = uint32_t()); \
}

void bind_conv_host_references(py::module &m) {
  //
  // Conv2d reference on host
  // tools/util/include/cutlass/util/reference/host/convolution.h

  /// double
  bind_conv2d_host_nhwc<double, double, double, double, double>(m);
  /// float
  bind_conv2d_host_nhwc<float, float, float, float, float>(m);
  /// half
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, cutlass::half_t>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, float>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, cutlass::half_t>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, float, float>(m);
  bind_conv2d_host_nhwc<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, float>(m);
  /// bfloat16
  bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, cutlass::bfloat16_t>(m);
  bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
  bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, cutlass::bfloat16_t>(m);
  bind_conv2d_host_nhwc<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);
  /// s8
  bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_conv2d_host_nhwc<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_conv2d_host_nhwc<int8_t, int8_t, int32_t, int32_t, float>(m);

  bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_conv2d_host_nc32hw32<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_conv2d_host_nc32hw32<int8_t, int8_t, int32_t, int32_t, float>(m);

  //
  // Compare whether two tensors are equal
  //
  /// double
  bind_tensor_equals<double, cutlass::layout::TensorNHWC>(m);
  /// float
  bind_tensor_equals<float, cutlass::layout::TensorNHWC>(m);
  /// half
  bind_tensor_equals<cutlass::half_t, cutlass::layout::TensorNHWC>(m);
  /// bfloat16
  bind_tensor_equals<cutlass::bfloat16_t, cutlass::layout::TensorNHWC>(m);
  /// s32
  bind_tensor_equals<int32_t, cutlass::layout::TensorNHWC>(m);
  bind_tensor_equals<int32_t, cutlass::layout::TensorNCxHWx<32>>(m);
  /// s8
  bind_tensor_equals<int8_t, cutlass::layout::TensorNHWC>(m);
  bind_tensor_equals<int8_t, cutlass::layout::TensorNCxHWx<32>>(m);

  /// Cache
  py::class_<test::conv::device::CachedTestKey>(m, "CachedTestKey")
    .def(py::init<>())
    .def(py::init<std::string, std::string, std::string, uint32_t, uint32_t, uint32_t>());

  py::class_<test::conv::device::CachedTestResult>(m, "CachedTestResult")
    .def(py::init<>())
    .def(py::init<uint32_t>())
    .def_readwrite("D", &test::conv::device::CachedTestResult::D);

  py::class_<test::conv::device::CachedTestResultListing>(m, "CachedTestResultListing")
    .def(py::init<const std::string &>())
    .def("find", &test::conv::device::CachedTestResultListing::find)
    .def("append", &test::conv::device::CachedTestResultListing::append)
    .def("write", &test::conv::device::CachedTestResultListing::write);

  py::class_<test::conv::device::CRC32>(m, "CRC32")
    .def(py::init<>());

  BIND_TENSOR_HASH(double, cutlass::layout::TensorNHWC);
  BIND_TENSOR_HASH(float, cutlass::layout::TensorNHWC);
  BIND_TENSOR_HASH(cutlass::half_t, cutlass::layout::TensorNHWC);
  BIND_TENSOR_HASH(cutlass::bfloat16_t, cutlass::layout::TensorNHWC);
  BIND_TENSOR_HASH(int32_t, cutlass::layout::TensorNHWC);
  BIND_TENSOR_HASH(int8_t, cutlass::layout::TensorNCxHWx<32>);
}
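
// Illustrative sketch, not from this diff: exercising the bound host-side
// comparison with CUTLASS utility types (HostTensor allocation is host-only
// here, so no GPU is required).
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

bool views_equal_example() {
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> a({1, 8, 8, 16}, /*device_backed=*/false);
  cutlass::HostTensor<float, cutlass::layout::TensorNHWC> b({1, 8, 8, 16}, /*device_backed=*/false);
  cutlass::reference::host::TensorFill(a.host_view(), 1.0f);
  cutlass::reference::host::TensorFill(b.host_view(), 1.0f);
  return cutlass::reference::host::TensorEquals(a.host_view(), b.host_view());
}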
@ -1,45 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind gemm test to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "host.h"

namespace py = pybind11;

void bind_gemm_test(py::module &m) {
  py::module_ host_submodule = m.def_submodule("host");
  bind_gemm_host_reference(host_submodule);
}
@ -1,431 +0,0 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/* \file
   \brief Bind gemm test host functions to python
*/
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl_bind.h>

#include "cutlass/cutlass.h"
#include "cutlass/util/reference/host/gemm.h"
#include "cutlass/util/reference/host/tensor_compare.h"
#include "cutlass/util/host_reorder.h"

#include "cutlass/functional.h"

namespace py = pybind11;


template<
  typename ElementA, typename LayoutA,
  typename ElementB, typename LayoutB,
  typename ElementC, typename LayoutC,
  typename AccumulatorType, typename ComputeType,
  typename InnerProductOp>
void bind_host_gemm_saturate(py::module &m) {
  m.def("gemm_saturate", py::overload_cast<
      cutlass::gemm::GemmCoord, ComputeType,
      cutlass::TensorRef<ElementA, LayoutA>,
      cutlass::TensorRef<ElementB, LayoutB>,
      ComputeType,
      cutlass::TensorRef<ElementC, LayoutC>,
      cutlass::TensorRef<ElementC, LayoutC>,
      AccumulatorType>(
    &cutlass::reference::host::compute_gemm<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ComputeType,
      AccumulatorType,
      InnerProductOp,
      cutlass::NumericConverterClamp<ElementC, AccumulatorType>>
  ));
}

template<
  typename ElementA, typename LayoutA,
  typename ElementB, typename LayoutB,
  typename ElementC, typename LayoutC,
  typename AccumulatorType, typename ComputeType,
  typename InnerProductOp>
void bind_host_gemm(py::module &m) {
  m.def("gemm", py::overload_cast<
      cutlass::gemm::GemmCoord, ComputeType,
      cutlass::TensorRef<ElementA, LayoutA>,
      cutlass::TensorRef<ElementB, LayoutB>,
      ComputeType,
      cutlass::TensorRef<ElementC, LayoutC>,
      cutlass::TensorRef<ElementC, LayoutC>,
      AccumulatorType>(
    &cutlass::reference::host::compute_gemm<
      ElementA, LayoutA,
      ElementB, LayoutB,
      ElementC, LayoutC,
      ComputeType,
      AccumulatorType,
      InnerProductOp,
      cutlass::NumericConverter<ElementC, AccumulatorType>>
  ));
}


template<
  typename ElementA, typename ElementB, typename ElementC,
  typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add(py::module &m) {
  // Template arguments follow bind_host_gemm's declaration order:
  // (..., AccumulatorType, ComputeType, InnerProductOp).
  bind_host_gemm<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);
}

template<
  typename ElementA, typename ElementB, typename ElementC,
  typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_saturate(py::module &m) {
  bind_host_gemm_saturate<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::RowMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::RowMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::RowMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm_saturate<
    ElementA, cutlass::layout::ColumnMajor,
    ElementB, cutlass::layout::ColumnMajor,
    ElementC, cutlass::layout::ColumnMajor,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);
}


template<
  typename ElementA, typename ElementB, typename ElementC,
  typename AccumulatorType, typename ComputeType>
void bind_host_gemm_multiply_add_interleaved(py::module &m) {
  bind_host_gemm<
    ElementA, cutlass::layout::RowMajorInterleaved<32>,
    ElementB, cutlass::layout::RowMajorInterleaved<32>,
    ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementB, cutlass::layout::RowMajorInterleaved<32>,
    ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajorInterleaved<32>,
    ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajorInterleaved<32>,
    ElementB, cutlass::layout::RowMajorInterleaved<32>,
    ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::RowMajorInterleaved<32>,
    ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementB, cutlass::layout::RowMajorInterleaved<32>,
    ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementC, cutlass::layout::RowMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);

  bind_host_gemm<
    ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
    ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
    AccumulatorType, ComputeType,
    cutlass::multiply_add<AccumulatorType>>(m);
|
||||
}
|
||||
|
||||
template<
|
||||
typename ElementA, typename ElementB, typename ElementC,
|
||||
typename AccumulatorType, typename ComputeType>
|
||||
void bind_host_gemm_multiply_add_saturate_interleaved(py::module &m) {
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ComputeType, AccumulatorType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::RowMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::RowMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::RowMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::RowMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
|
||||
bind_host_gemm_saturate<
|
||||
ElementA, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementB, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
ElementC, cutlass::layout::ColumnMajorInterleaved<32>,
|
||||
AccumulatorType, ComputeType,
|
||||
cutlass::multiply_add<AccumulatorType>>(m);
|
||||
}
|
||||
|
||||
#define BIND_TENSOR_EQUAL(Element, Layout) { \
  m.def("equals", py::overload_cast< \
    const cutlass::TensorView<Element, Layout>&, const cutlass::TensorView<Element, Layout>&>( \
    &cutlass::reference::host::TensorEquals<Element, Layout>)); \
}

void bind_gemm_host_reference(py::module &m) {

  /// double
  bind_host_gemm_multiply_add<double, double, double, double, double>(m);
  /// float
  bind_host_gemm_multiply_add<float, float, float, float, float>(m);
  /// half_t
  bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t, cutlass::half_t>(m);
  bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, cutlass::half_t, float, float>(m);
  bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, cutlass::half_t, cutlass::half_t>(m);
  bind_host_gemm_multiply_add<cutlass::half_t, cutlass::half_t, float, float, float>(m);
  /// bfloat16
  bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, cutlass::bfloat16_t, float, float>(m);
  bind_host_gemm_multiply_add<cutlass::bfloat16_t, cutlass::bfloat16_t, float, float, float>(m);

  /// s8
  bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add<int8_t, int8_t, int32_t, int32_t, float>(m);

  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);

  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate<int8_t, int8_t, int32_t, int32_t, float>(m);

  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int32_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, int8_t>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int8_t, int32_t, float>(m);
  bind_host_gemm_multiply_add_saturate_interleaved<int8_t, int8_t, int32_t, int32_t, float>(m);

  // float
  BIND_TENSOR_EQUAL(float, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(float, cutlass::layout::ColumnMajor);

  // double
  BIND_TENSOR_EQUAL(double, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(double, cutlass::layout::ColumnMajor);

  // half_t
  BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(cutlass::half_t, cutlass::layout::ColumnMajor);

  // bfloat16
  BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(cutlass::bfloat16_t, cutlass::layout::ColumnMajor);

  // int32_t
  BIND_TENSOR_EQUAL(int32_t, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(int32_t, cutlass::layout::ColumnMajor);

  // int8_t
  BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajor);
  BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajor);
  BIND_TENSOR_EQUAL(int8_t, cutlass::layout::RowMajorInterleaved<32>);
  BIND_TENSOR_EQUAL(int8_t, cutlass::layout::ColumnMajorInterleaved<32>);
}
@ -1,55 +0,0 @@
import re


def SubstituteTemplate(template, values):
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text

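# A minimal usage sketch of SubstituteTemplate (illustrative values): each
# ${key} placeholder is replaced by its value, and the loop repeats until no
# substitution fires, so values may themselves contain further placeholders.
#
#   >>> SubstituteTemplate("ld${op} ${reg}", {"op": "matrix", "reg": "r0"})
#   'ldmatrix r0'
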
from pycutlass.type_hint import *
from pycutlass.tensor_ref import *
from pycutlass.operation import *
from pycutlass.epilogue import *
from pycutlass.parser import *
from pycutlass.compiler import ArtifactManager
from pycutlass.memory_manager import *
from pycutlass.arguments import *
from pycutlass.library import *
from pycutlass.c_types import *
from pycutlass.gemm_operation import *
from pycutlass.conv2d_operation import *
from pycutlass.compiler import *
from pycutlass.utils import *
from pycutlass.frontend import *
from pycutlass.reduction_operation import *
from pycutlass.utils.device import device_cc

# module-wide variables

import sys
this = sys.modules[__name__]

# artifact manager
this.compiler = ArtifactManager()

try:
    if not hasattr(this, 'DEVICE_CC') or this.DEVICE_CC is None:
        this.DEVICE_CC = device_cc()
except Exception:
    # fall back gracefully when no CUDA device is visible
    this.DEVICE_CC = None


def get_memory_pool(init_pool_size=0, max_pool_size=2**34):
    this.memory_pool = PoolMemoryManager(
        init_pool_size=init_pool_size,
        max_pool_size=max_pool_size
    )
    return this.memory_pool
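
# Typical setup sketch (pool sizes are illustrative): allocate a device
# memory pool once, before constructing and running any operations.
#
#   import pycutlass
#   pycutlass.get_memory_pool(init_pool_size=2**30, max_pool_size=2**32)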
@ -1,118 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################
from .frontend import CupyFrontend
from typeguard import typechecked
from pycutlass.frontend import *
from typing import Union

import numpy as np
from cuda import cuda
from cuda import cudart

try:
    import torch
    torch_available = True
except ImportError:
    torch_available = False

try:
    import cupy as cp
    cupy_available = True
except ImportError:
    cupy_available = False


# @typechecked
class ArgumentBase:
    """
    Base class for operation arguments
    """

    def __init__(self,
                 A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]',
                 **kwargs) -> None:

        # tensor_C can be interpreted as the bias with bias=True in keyword args
        if "bias" in kwargs.keys():
            self.bias = kwargs["bias"]
        else:
            # by default, tensor_C is not a bias
            self.bias = False

        # preprocess the input tensors
        if isinstance(A, np.ndarray):
            self.host_D = D
            self.buffer_A = NumpyFrontend.argument(A, False)
            self.buffer_B = NumpyFrontend.argument(B, False)
            self.buffer_C = NumpyFrontend.argument(C, False)
            self.buffer_D = NumpyFrontend.argument(D, True)
            self.ptr_A = self.buffer_A.ptr
            self.ptr_B = self.buffer_B.ptr
            self.ptr_C = self.buffer_C.ptr
            self.ptr_D = self.buffer_D.ptr
            # number of elements in C
            self.tensor_c_numel = C.size
        elif torch_available and isinstance(A, torch.Tensor):
            self.ptr_A = TorchFrontend.argument(A)
            self.ptr_B = TorchFrontend.argument(B)
            self.ptr_C = TorchFrontend.argument(C)
            self.ptr_D = TorchFrontend.argument(D)
            # number of elements in C
            self.tensor_c_numel = C.numel()
        elif isinstance(A, cuda.CUdeviceptr):
            self.ptr_A = A
            self.ptr_B = B
            self.ptr_C = C
            self.ptr_D = D
        elif cupy_available and isinstance(A, cp.ndarray):
            self.ptr_A = CupyFrontend.argument(A)
            self.ptr_B = CupyFrontend.argument(B)
            self.ptr_C = CupyFrontend.argument(C)
            self.ptr_D = CupyFrontend.argument(D)
            # number of elements in C
            self.tensor_c_numel = C.size
        else:
            raise TypeError(
                "Unsupported frontend; expected a numpy, torch, or cupy array, or a CUdeviceptr")

    def sync(self, stream_sync=True):
        if stream_sync:
            err, = cudart.cudaDeviceSynchronize()
            if err != cudart.cudaError_t.cudaSuccess:
                raise RuntimeError("CUDA Error %s" % str(err))

        # copy the result back to the host when the inputs came from numpy
        if hasattr(self, "host_D"):
            err, = cuda.cuMemcpyDtoH(
                self.host_D, self.ptr_D, self.host_D.size * self.host_D.itemsize)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))
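
# A minimal construction sketch (illustrative; `MyArguments` is a hypothetical
# subclass standing in for an operation-specific arguments class, and the
# shapes/dtypes are arbitrary). With numpy inputs, device buffers are
# allocated behind the scenes and `sync()` copies D back into the array
# passed in here:
#
#   import numpy as np
#   A = np.random.rand(128, 128).astype(np.float32)
#   B = np.random.rand(128, 128).astype(np.float32)
#   C = np.zeros((128, 128), dtype=np.float32)
#   D = np.zeros_like(C)
#   args = MyArguments(A=A, B=B, C=C, D=D)  # hypothetical subclass
#   ...                                     # launch the operation with `args`
#   args.sync()                             # D now holds the result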
@ -1,395 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################

"""
|
||||
Utilities for stamping out collective mainloops for SM90 kernels
|
||||
"""
|
||||
|
||||
import cute
|
||||
import cutlass
|
||||
from pycutlass import SubstituteTemplate
|
||||
import pycutlass.library as library
|
||||
|
||||
|
||||
tma_alignment_bytes = 16
|
||||
cp_async_min_alignment_bytes = 4
|
||||
|
||||
|
||||
class RowColMajorToGMMAMajor:
|
||||
@staticmethod
|
||||
def A(layout, element):
|
||||
"""
|
||||
Converts operand A's layout from row/column major format into CuTe's GMMA major format
|
||||
|
||||
:param layout: layout of the A operand
|
||||
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
|
||||
:param element: data type of the A operand
|
||||
|
||||
:return: C++ CuTe GMMA major format
|
||||
:rtype: cute.GMMAMajor
|
||||
"""
|
||||
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
|
||||
if layout == cutlass.ColumnMajor and not type_requires_k_major:
|
||||
return cute.GMMAMajor.MN
|
||||
else:
|
||||
return cute.GMMAMajor.K
|
||||
|
||||
@staticmethod
|
||||
def B(layout, element):
|
||||
"""
|
||||
Converts operand B's layout from row/column major format into CuTe's GMMA major format
|
||||
|
||||
:param layout: layout of the B operand
|
||||
:type layout: cutlass.RowMajor or cutlass.ColumnMajor
|
||||
:param element: data type of the B operand
|
||||
|
||||
:return: C++ CuTe GMMA major format
|
||||
:rtype: cute.GMMAMajor
|
||||
"""
|
||||
type_requires_k_major = (element == cutlass.tfloat32) or (element == cutlass.int8)
|
||||
if layout == cutlass.RowMajor and not type_requires_k_major:
|
||||
return cute.GMMAMajor.MN
|
||||
else:
|
||||
return cute.GMMAMajor.K
|
||||
|
||||
|
||||
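# Behavior sketch (follows directly from the definitions above): A is MN-major
# only when it is column major and the element type does not force K-major;
# B mirrors this with row major. For example:
#
#   RowColMajorToGMMAMajor.A(cutlass.ColumnMajor, cutlass.float16)   # -> cute.GMMAMajor.MN
#   RowColMajorToGMMAMajor.A(cutlass.ColumnMajor, cutlass.tfloat32)  # -> cute.GMMAMajor.K
#   RowColMajorToGMMAMajor.B(cutlass.RowMajor, cutlass.float16)      # -> cute.GMMAMajor.MN

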
def cluster_shape_to_tma(dim):
    """
    Returns the TMA copy type for a given cluster dimension

    :param dim: a given dimension of a cluster
    :type dim: int

    :return: C++ TMA copy type
    :rtype: str
    """
    return 'cute::SM90_TMA_LOAD' if dim == 1 else 'cute::SM90_TMA_LOAD_MULTICAST'

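# Example (illustrative): a cluster dimension of 1 means there is nothing to
# multicast to, so the plain TMA load is emitted; anything larger selects the
# multicast flavor.
#
#   cluster_shape_to_tma(1)  # -> 'cute::SM90_TMA_LOAD'
#   cluster_shape_to_tma(2)  # -> 'cute::SM90_TMA_LOAD_MULTICAST'

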
def make_cpasync_gmem_tiled_copy(thread_count, element, alignment, gmma_layout, dim_mn, dim_k):
    """
    Returns a `make_tiled_copy` call for a given configuration

    :param thread_count: number of threads in the threadblock
    :type thread_count: int
    :param element: data type of the operand in question
    :param alignment: byte alignment of the operand in question
    :type alignment: int
    :param gmma_layout: GMMA layout of the operand in question
    :type gmma_layout: cute.GMMAMajor
    :param dim_mn: extent of the M/N dimension of the tile
    :type dim_mn: int
    :param dim_k: extent of the reduction dimension of the tile
    :type dim_k: int

    :return: C++ call to `make_tiled_copy`
    :rtype: str
    """

    emission_str = """decltype(cute::make_tiled_copy(
        cute::Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<cute::uint_byte_t<static_cast<int>(sizeof(${element})) * ${alignment}>>, ${element}>{},
        cute::Layout<cute::Shape<_${shape0_x}, _${shape0_y}>,
                     cute::Stride<_${stride_x}, _${stride_y}>>{},
        cute::Layout<cute::Shape<_${shape1_x}, _${shape1_y}>>{}))"""
    if gmma_layout == cute.GMMAMajor.K:
        threads_major = dim_k // alignment
        threads_minor = thread_count // threads_major
        values = {
            'shape0_x': str(threads_minor),
            'shape0_y': str(threads_major),
            'stride_x': str(threads_major),
            'stride_y': '1',
            'shape1_x': '1',
            'shape1_y': str(alignment)
        }
    elif gmma_layout == cute.GMMAMajor.MN:
        threads_major = dim_mn // alignment
        threads_minor = thread_count // threads_major
        values = {
            'shape0_x': str(threads_major),
            'shape0_y': str(threads_minor),
            'stride_x': '1',
            'stride_y': str(threads_major),
            'shape1_x': str(alignment),
            'shape1_y': '1'
        }
    else:
        raise Exception('Unexpected GMMA layout {}'.format(gmma_layout))

    # Add common values
    values['element'] = library.DataTypeTag[element]
    values['alignment'] = str(alignment)
    return SubstituteTemplate(emission_str, values)

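# Worked example of the thread-layout arithmetic above (values illustrative):
# with thread_count=128, alignment=8, a K-major operand, and dim_k=64, the
# major dimension gets threads_major = 64 // 8 = 8 threads, each moving 8
# contiguous elements per cp.async, and threads_minor = 128 // 8 = 16 rows
# are covered per iteration, i.e. a (16, 8) thread layout with stride (8, 1)
# and a (1, 8) value layout.

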
def max_stages(op, arch):
    """
    Returns the maximum number of pipeline stages that can be used for an operation.

    :param op: operation for which the maximum stages should be computed. If stages are
               set via the `op.tile_description.stages` parameter, this setting is ignored
               in the present calculation
    :type op: pycutlass.GemmOperation
    :param arch: compute capability of the device on which the operation will be run
    :type arch: int

    :return: maximum number of pipeline stages that can be used for an operation
    :rtype: int
    """
    smem_per_stage = library.CalculateSmemUsagePerStage(op)
    smem_capacity = library.SharedMemPerCC[arch]
    return int(smem_capacity // smem_per_stage)

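# Back-of-the-envelope example (illustrative numbers): a 128x128x64 fp16 tile
# stages roughly (128*64 + 128*64) * 2 bytes = 32 KiB of operand data per
# stage. With roughly 227 KiB of shared memory per SM on SM90, the integer
# division above yields 227 // 32 = 7 stages.

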
class LayoutToStride:
    _variable_first = 'cute::Stride<int64_t, cute::Int<1>, int64_t>'
    _variable_last = 'cute::Stride<cute::Int<1>, int64_t, int64_t>'

    @staticmethod
    def A(layout):
        """
        Returns the CuTe stride type corresponding to the layout of operand A

        :param layout: layout of the A operand
        :type layout: cutlass.RowMajor or cutlass.ColumnMajor

        :return: C++ declaration of CuTe stride
        :rtype: str
        """
        if layout == cutlass.RowMajor:
            return LayoutToStride._variable_first
        elif layout == cutlass.ColumnMajor:
            return LayoutToStride._variable_last
        else:
            raise Exception('Unsupported layout {}'.format(layout))

    @staticmethod
    def B(layout):
        """
        Returns the CuTe stride type corresponding to the layout of operand B

        :param layout: layout of the B operand
        :type layout: cutlass.RowMajor or cutlass.ColumnMajor

        :return: C++ declaration of CuTe stride
        :rtype: str
        """
        if layout == cutlass.RowMajor:
            return LayoutToStride._variable_last
        elif layout == cutlass.ColumnMajor:
            return LayoutToStride._variable_first
        else:
            raise Exception('Unsupported layout {}'.format(layout))


EMISSION_STR = """
using TileShape_MNK = cute::Shape<_${threadblock_shape_m}, _${threadblock_shape_n}, _${threadblock_shape_k}>;
using ClusterShape_MNK = cute::Shape<_${cluster_shape_m}, _${cluster_shape_n}, _${cluster_shape_k}>;
using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
    ${internal_element_A}, ${internal_element_B}, ${element_accumulator}, TileShape_MNK, ${gmma_layout_A}, ${gmma_layout_B}>()));

using SmemLayoutAtomA = decltype(cute::GMMA::smem_selector<${gmma_layout_A}, ${internal_element_A}, _${threadblock_shape_m}, _${threadblock_shape_k}>());
using SmemLayoutAtomB = decltype(cute::GMMA::smem_selector<${gmma_layout_B}, ${internal_element_B}, _${threadblock_shape_n}, _${threadblock_shape_k}>());

using CollectiveOp = typename cutlass::gemm::collective::CollectiveMma<
    ${mainloop_type}<${stage_count}, ClusterShape_MNK${kernel_schedule}>,
    TileShape_MNK,
    ${element_A},
    ${stride_A},
    ${element_B},
    ${stride_B},
    TiledMma,
    ${gmem_tiled_copy_A},
    SmemLayoutAtomA,
    void, // GMMA_SS does not need an SmemCopyAtom
    ${transform_A},
    ${gmem_tiled_copy_B},
    SmemLayoutAtomB,
    void, // GMMA_SS does not need an SmemCopyAtom
    ${transform_B}
>;
"""


def internal_element(element):
    """
    Returns the data type internally used for `element`.

    :param element: data type

    :return: data type used internally
    """
    return cutlass.tfloat32 if element == cutlass.float32 else element


def common_values(op, stage_count, transform_A, transform_B):
    """
    Returns a dictionary containing common values to be substituted in the emission of the
    collective operation declaration. Values specific to a particular collective operation
    should be added to these.

    :param op: GEMM operation for which to build a collective operation
    :type op: pycutlass.GemmOperation
    :param stage_count: number of pipeline stages to use in the operation
    :type stage_count: int
    :param transform_A: transformation to perform on the A operand
    :type transform_A: str
    :param transform_B: transformation to perform on the B operand
    :type transform_B: str

    :return: dictionary containing values to substitute in the emission string
    :rtype: dict
    """
    internal_element_a = internal_element(op.A.element)
    internal_element_b = internal_element(op.B.element)

    return {
        'threadblock_shape_m': str(op.tile_description.threadblock_shape[0]),
        'threadblock_shape_n': str(op.tile_description.threadblock_shape[1]),
        'threadblock_shape_k': str(op.tile_description.threadblock_shape[2]),
        'cluster_shape_m': str(op.tile_description.cluster_shape[0]),
        'cluster_shape_n': str(op.tile_description.cluster_shape[1]),
        'cluster_shape_k': str(op.tile_description.cluster_shape[2]),
        'element_A': library.DataTypeTag[op.A.element],
        'element_B': library.DataTypeTag[op.B.element],
        'internal_element_A': library.DataTypeTag[internal_element_a],
        'internal_element_B': library.DataTypeTag[internal_element_b],
        'element_accumulator': library.DataTypeTag[op.accumulator_type()],
        'gmma_layout_A': library.CuTeLayoutTag[RowColMajorToGMMAMajor.A(op.A.layout, internal_element_a)],
        'gmma_layout_B': library.CuTeLayoutTag[RowColMajorToGMMAMajor.B(op.B.layout, internal_element_b)],
        'stride_A': LayoutToStride.A(op.A.layout),
        'stride_B': LayoutToStride.B(op.B.layout),
        'stage_count': str(stage_count),
        'transform_A': transform_A,
        'transform_B': transform_B
    }


def build_gmma_tma(op):
    """
    Builds a collective operation declaration targeting TMA GMMA kernels

    :param op: GEMM operation for which to build a collective operation
    :type op: pycutlass.GemmOperation

    :return: string containing the C++ declaration of the collective operation
    :rtype: str
    """
    A_tma_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % tma_alignment_bytes == 0
    B_tma_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % tma_alignment_bytes == 0
    if not A_tma_aligned or not B_tma_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use TMA'.format(tma_alignment_bytes))

    max_stage_count = max_stages(op, arch=90)
    if op.tile_description.stages is None:
        op.tile_description.stages = max_stage_count
    elif op.tile_description.stages > max_stage_count:
        raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')

    kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecialized'
    if op.tile_description.persistent:
        kernel_schedule = 'cutlass::gemm::KernelTmaWarpSpecializedPersistent'

    transform_A = 'cute::identity'
    transform_B = 'cute::identity'
    values = common_values(op, op.tile_description.stages, transform_A, transform_B)
    specific_values = {
        'mainloop_type': 'cutlass::gemm::MainloopSm90TmaGmmaWarpSpecialized',
        'kernel_schedule': ', ' + kernel_schedule,
        'gmem_tiled_copy_A': cluster_shape_to_tma(op.tile_description.cluster_shape[1]),
        'gmem_tiled_copy_B': cluster_shape_to_tma(op.tile_description.cluster_shape[0])
    }
    values.update(specific_values)

    return SubstituteTemplate(EMISSION_STR, values)


def build_gmma_cpasync(op):
    """
    Builds a collective operation declaration targeting cp.async GMMA kernels

    :param op: GEMM operation for which to build a collective operation
    :type op: pycutlass.GemmOperation

    :return: string containing the C++ declaration of the collective operation
    :rtype: str
    """
    A_cp_async_aligned = (library.DataTypeSizeBytes[op.A.element] * op.A.alignment) % cp_async_min_alignment_bytes == 0
    B_cp_async_aligned = (library.DataTypeSizeBytes[op.B.element] * op.B.alignment) % cp_async_min_alignment_bytes == 0
    if not A_cp_async_aligned or not B_cp_async_aligned:
        raise Exception('Each of the A and B operands must be aligned to {} bytes to use cp.async'.format(cp_async_min_alignment_bytes))

    max_stage_count = max_stages(op, arch=90)
    if op.tile_description.stages is None:
        op.tile_description.stages = max_stage_count
    elif op.tile_description.stages > max_stage_count:
        raise Exception('Combination of threadblock shape, data types, and number of stages exceeds shared memory capacity.')

    transform_A = 'cute::identity'
    transform_B = 'cute::identity'

    thread_count = 128
    cpasync_copy_A = make_cpasync_gmem_tiled_copy(thread_count, op.A.element, op.A.alignment, RowColMajorToGMMAMajor.A(op.A.layout, op.A.element),
                                                  op.tile_description.threadblock_shape[0], op.tile_description.threadblock_shape[2])
    cpasync_copy_B = make_cpasync_gmem_tiled_copy(thread_count, op.B.element, op.B.alignment, RowColMajorToGMMAMajor.B(op.B.layout, op.B.element),
                                                  op.tile_description.threadblock_shape[1], op.tile_description.threadblock_shape[2])

    values = common_values(op, op.tile_description.stages, transform_A, transform_B)
    specific_values = {
        'mainloop_type': 'cutlass::gemm::MainloopSm90CpAsyncGmma',
        'kernel_schedule': '',
        'gmem_tiled_copy_A': cpasync_copy_A,
        'gmem_tiled_copy_B': cpasync_copy_B
    }
    values.update(specific_values)

    return SubstituteTemplate(EMISSION_STR, values)


def build(operation):
    """
    Builds a collective operation declaration targeting cp.async or TMA for GMMA kernels

    :param operation: GEMM operation for which to build a collective operation
    :type operation: pycutlass.GemmOperation

    :return: string containing the C++ declaration of the collective operation
    :rtype: str
    """
    A_tma_aligned = (library.DataTypeSizeBytes[operation.A.element] * operation.A.alignment) % tma_alignment_bytes == 0
    B_tma_aligned = (library.DataTypeSizeBytes[operation.B.element] * operation.B.alignment) % tma_alignment_bytes == 0
    tma_correct_size = (library.DataTypeSizeBytes[operation.A.element] == 2 and library.DataTypeSizeBytes[operation.B.element] == 2)
    tma_correct_layout = (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor)
    if A_tma_aligned and B_tma_aligned and (tma_correct_size or tma_correct_layout):
        return build_gmma_tma(operation)
    else:
        return build_gmma_cpasync(operation)
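
# Dispatch sketch (follows from the checks in build() above): a half-precision
# GEMM whose A and B carry 8-element (16-byte) alignment satisfies both the
# TMA alignment and the 2-byte-element condition, so it takes the TMA path;
# an fp32 GEMM with alignment 1 fails the 16-byte alignment check and falls
# back to the cp.async mainloop.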
@ -1,279 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################

import ctypes

from pycutlass.library import *


class GemmCoord_(ctypes.Structure):
    _fields_ = [
        ("m", ctypes.c_int),
        ("n", ctypes.c_int),
        ("k", ctypes.c_int)
    ]

    def __init__(self, gemm_coord) -> None:
        for field_name, _ in self._fields_:
            setattr(self, field_name, getattr(gemm_coord, field_name)())


class GemmCoordBatched_(ctypes.Structure):
    """
    Wrapper around a GemmCoord that also contains batch count. This is used for encoding
    batched GEMM inputs to CUTLASS 3 GEMMs.
    """
    _fields_ = [
        ("m", ctypes.c_int),
        ("n", ctypes.c_int),
        ("k", ctypes.c_int),
        ("batch_count", ctypes.c_int)
    ]

    def __init__(self, gemm_coord, batch_count) -> None:
        for field_name, _ in self._fields_[:-1]:
            setattr(self, field_name, getattr(gemm_coord, field_name)())
        self.batch_count = batch_count


class MatrixCoord_(ctypes.Structure):
    _fields_ = [
        ("row", ctypes.c_int),
        ("column", ctypes.c_int)
    ]


class dim3_(ctypes.Structure):
    _fields_ = [
        ("x", ctypes.c_int),
        ("y", ctypes.c_int),
        ("z", ctypes.c_int)
    ]

class StrideBatched_(ctypes.Structure):
    """
    CUTLASS 3.0 strides for operands contain one static dimension and two variable dimensions.
    The variable dimensions represent the stride along the non-unit-stride dimension of the
    row/column-major layout and the batch stride. This structure encodes the two variable
    dimensions.
    """
    _fields_ = [
        ("major_stride", ctypes.c_int64),
        ("batch_stride", ctypes.c_int64)
    ]

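# Example (illustrative values): a row-major 128x64 operand has unit stride
# along columns, so its one variable leading stride is the row pitch (64),
# and consecutive matrices in a batch are separated by 128 * 64 elements:
#
#   stride = StrideBatched_(major_stride=64, batch_stride=128 * 64)

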
dtype2ctype = {
    cutlass.float16: ctypes.c_uint16,
    cutlass.float32: ctypes.c_float,
    cutlass.float64: ctypes.c_double,
    cutlass.int32: ctypes.c_int32
}


def get_gemm_arguments_3x(epilogue_functor):
    _EpilogueOutputOpParams = epilogue_functor.epilogue_type

    class _GemmArguments(ctypes.Structure):
        _fields_ = [
            ("mode", ctypes.c_int),
            ("problem_size", GemmCoordBatched_),
            ("ptr_A", ctypes.c_void_p),
            ("stride_A", StrideBatched_),
            ("ptr_B", ctypes.c_void_p),
            ("stride_B", StrideBatched_),
            ("ptr_C", ctypes.c_void_p),
            ("stride_C", StrideBatched_),
            ("ptr_D", ctypes.c_void_p),
            ("stride_D", StrideBatched_),
            ("epilogue", _EpilogueOutputOpParams),
        ]

    return _GemmArguments, _EpilogueOutputOpParams


def get_gemm_arguments(epilogue_functor):
    _EpilogueOutputOpParams = epilogue_functor.epilogue_type

    class _GemmArguments(ctypes.Structure):
        _fields_ = [
            # Arguments from UniversalArgumentsBase
            ("mode", ctypes.c_int),
            ("problem_size", GemmCoord_),
            ("batch_count", ctypes.c_int),
            ("batch_stride_D", ctypes.c_longlong),
            # Remaining arguments
            ("epilogue", _EpilogueOutputOpParams),
            ("ptr_A", ctypes.c_void_p),
            ("ptr_B", ctypes.c_void_p),
            ("ptr_C", ctypes.c_void_p),
            ("ptr_D", ctypes.c_void_p),
            ("batch_stride_A", ctypes.c_longlong),
            ("batch_stride_B", ctypes.c_longlong),
            ("batch_stride_C", ctypes.c_longlong),
            ("stride_a", ctypes.c_longlong),
            ("stride_b", ctypes.c_longlong),
            ("stride_c", ctypes.c_longlong),
            ("stride_d", ctypes.c_longlong),
            ("lda", ctypes.c_longlong),
            ("ldb", ctypes.c_longlong),
            ("ldc", ctypes.c_longlong),
            ("ldd", ctypes.c_longlong),
            ("ptr_gather_A_indices", ctypes.c_void_p),
            ("ptr_gather_B_indices", ctypes.c_void_p),
            ("ptr_scatter_D_indices", ctypes.c_void_p)
        ]

    return _GemmArguments, _EpilogueOutputOpParams


###########################################################################################
# GEMM Grouped
###########################################################################################

def get_gemm_grouped_arguments(epilogue_functor):
    _EpilogueOutputOpParams = epilogue_functor.epilogue_type

    class _GEMMGroupedArguments(ctypes.Structure):
        _fields_ = [
            ("problem_sizes", ctypes.c_void_p),
            ("problem_count", ctypes.c_int),
            ("threadblock_count", ctypes.c_int),
            ("output_op", _EpilogueOutputOpParams),
            ("ptr_A", ctypes.c_void_p),
            ("ptr_B", ctypes.c_void_p),
            ("ptr_C", ctypes.c_void_p),
            ("ptr_D", ctypes.c_void_p),
            ("lda", ctypes.c_void_p),
            ("ldb", ctypes.c_void_p),
            ("ldc", ctypes.c_void_p),
            ("ldd", ctypes.c_void_p),
            ("host_problem_sizes", ctypes.c_void_p)
        ]

    return _GEMMGroupedArguments, _EpilogueOutputOpParams


############################################################################################
# Convolution2D
############################################################################################

class Conv2DProblemSize(ctypes.Structure):
    _fields_ = [
        ("N", ctypes.c_int),
        ("H", ctypes.c_int),
        ("W", ctypes.c_int),
        ("C", ctypes.c_int),
        ("P", ctypes.c_int),
        ("Q", ctypes.c_int),
        ("K", ctypes.c_int),
        ("R", ctypes.c_int),
        ("S", ctypes.c_int),
        ("pad_h", ctypes.c_int),
        ("pad_w", ctypes.c_int),
        ("stride_h", ctypes.c_int),
        ("stride_w", ctypes.c_int),
        ("dilation_h", ctypes.c_int),
        ("dilation_w", ctypes.c_int),
        ("mode", ctypes.c_int),  # kCrossCorrelation: 0, kConvolution: 1
        ("split_k_slices", ctypes.c_int),
        ("groups", ctypes.c_int)
    ]

    def __init__(self, problem_size) -> None:
        for field_name, _ in self._fields_:
            setattr(self, field_name, getattr(problem_size, field_name))


class Layout4D(ctypes.Structure):
    _fields_ = [
        ("stride", ctypes.c_int * 3)
    ]

    def __init__(self, tensor_ref):
        stride = tensor_ref.stride()
        self.stride = (stride.at(0), stride.at(1), stride.at(2))


class TensorRef_(ctypes.Structure):
    _fields_ = [
        ("ptr", ctypes.c_void_p),
        ("layout", Layout4D)
    ]

    def __init__(self, tensor_ref):
        self.ptr = tensor_ref.data()
        self.layout = Layout4D(tensor_ref.layout())


class TensorRef2D_(ctypes.Structure):
    _fields_ = [
        ("ptr", ctypes.c_void_p),
        ("stride", ctypes.c_int)
    ]


def get_conv2d_arguments(epilogue_functor):
    _EpilogueOutputOpParams = epilogue_functor.epilogue_type

    class _Conv2dArguments(ctypes.Structure):
        _fields_ = [
            ("problem_size", Conv2DProblemSize),     # offset 0
            ("ref_A", TensorRef_),                   # offset 72
            ("ref_B", TensorRef_),                   # offset 96
            ("ref_C", TensorRef_),                   # offset 120
            ("ref_D", TensorRef_),                   # offset 144
            ("output_op", _EpilogueOutputOpParams),  # offset 168
            ("split_k_mode", ctypes.c_int)           # offset 192
        ]

    return _Conv2dArguments, _EpilogueOutputOpParams


############################################################################################
# Reduction
############################################################################################

def get_reduction_params(epilogue_functor):
    _EpilogueOutputParams = epilogue_functor.epilogue_type

    class _ReductionParams(ctypes.Structure):
        _fields_ = [
            ("problem_size", MatrixCoord_),
            ("partitions", ctypes.c_int),
            ("partition_stride", ctypes.c_longlong),
            ("workspace", TensorRef2D_),
            ("destination", TensorRef2D_),
            ("source", TensorRef2D_),
            ("output_op", _EpilogueOutputParams)
        ]

    return _ReductionParams, _EpilogueOutputParams
@ -1,460 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
#################################################################################################
import pycutlass
from pycutlass import *
import cutlass
from cuda import cuda
from cuda import nvrtc
import tempfile
import os
import ctypes

import json
import sqlite3


IncludeTemplate = r'''#include "${include}"
'''


class CompilationOptions:
    '''
    Compilation options.
    '''

    def __init__(self, flags, arch, include_paths=[]):
        self.includes = []
        self.include_paths = include_paths
        self.flags = flags
        self.arch = arch

    def get_str(self):
        options = ""

        for flag in self.flags:
            options += " " + flag

        for incl in self.include_paths:
            options += ' --include-path=%s' % incl

        arch_flag = " -arch=sm_%d" % self.arch
        if self.arch == 90:
            arch_flag += 'a'
        options += arch_flag

        return options

    def get(self):
        options = []

        for flag in self.flags:
            options.append(bytes(str.encode(flag)))

        for incl in self.include_paths:
            options.append(bytes(str.encode('--include-path=%s' % incl)))

        # each nvrtc option is passed as its own token, without leading whitespace
        arch_flag = "-arch=sm_%d" % self.arch
        if self.arch == 90:
            arch_flag += 'a'
        options.append(bytes(str.encode(arch_flag)))

        return options
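# Example of the two emission modes (paths illustrative, assuming arch=90):
#
#   opts = CompilationOptions(['-std=c++17'], 90, include_paths=['/opt/cutlass/include'])
#   opts.get_str()  # ' -std=c++17 --include-path=/opt/cutlass/include -arch=sm_90a'
#   opts.get()      # [b'-std=c++17', b'--include-path=/opt/cutlass/include', b'-arch=sm_90a']

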
def convertToBinaryData(filename):
|
||||
with open(filename, 'rb') as file:
|
||||
blobData = file.read()
|
||||
return blobData
|
||||
|
||||
|
||||
def CDLLBin(host_binary):
|
||||
tempfile.tempdir = "./"
|
||||
temp_so = tempfile.NamedTemporaryFile(
|
||||
prefix='host_func', suffix='.so', delete=True)
|
||||
with open(temp_so.name, 'wb') as file:
|
||||
file.write(host_binary)
|
||||
host_lib = ctypes.CDLL(temp_so.name)
|
||||
return host_lib
|
||||
|
||||
|
||||
class ArtifactManager:
|
||||
"""
|
||||
Artifact manager
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
try:
|
||||
connection = sqlite3.connect("./compiled_cache.db")
|
||||
cursor = connection.cursor()
|
||||
sqlite_create_table_query = """CREATE TABLE compiled_operations(op_key TEXT NOT NULL UNIQUE, cubin BLOB NOT NULL, hostbin BLOB NOT NULL, op_name TEXT NOT NULL, op_attrs TEXT NOT NULL)"""
|
||||
cursor.execute(sqlite_create_table_query)
|
||||
connection.commit()
|
||||
cursor.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
self.nvcc()
|
||||
self.compiled_cache_device = cutlass.CompileCache()
|
||||
self.compiled_cache_host = cutlass.CompileCache()
|
||||
|
||||
def nvrtc(self):
|
||||
self.backend = "nvrtc"
|
||||
self.default_compile_options = [
|
||||
'-std=c++17', '-default-device'
|
||||
]
|
||||
def nvcc(self):
|
||||
self.backend = "nvcc"
|
||||
self.default_compile_options = [
|
||||
'-std=c++17', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored'
|
||||
]
|
||||
def insert_operation(self, op_key, cubin, hostfile, op_name, op_attrs):
|
||||
connection = sqlite3.connect("./compiled_cache.db")
|
||||
cursor = connection.cursor()
|
||||
sqlite_insert_blob_query = """ INSERT OR IGNORE INTO compiled_operations (op_key, cubin, hostbin, op_name, op_attrs) VALUES (?, ?, ?, ?, ?)"""
|
||||
|
||||
hostbin = convertToBinaryData(hostfile)
|
||||
|
||||
data_tuple = (op_key, cubin, hostbin, op_name, json.dumps(op_attrs))
|
||||
|
||||
cursor.execute(sqlite_insert_blob_query, data_tuple)
|
||||
connection.commit()
|
||||
cursor.close()
|
||||
|
||||
def load_operation(self, op_key, extra_funcs):
|
||||
connection = sqlite3.connect("./compiled_cache.db")
|
||||
cursor = connection.cursor()
|
||||
sqlite_fetch_blob_query = """SELECT * from compiled_operations where op_key = ?"""
|
||||
# try:
|
||||
cursor.execute(sqlite_fetch_blob_query, (op_key, ))
|
||||
record = cursor.fetchall()
|
||||
if len(record) == 0:
|
||||
return False
|
||||
for row in record:
|
||||
key, cubin_image, host_binary, operation_name, op_attr = row
|
||||
op_attr = json.loads(op_attr)
|
||||
err, module = cuda.cuModuleLoadData(cubin_image)
|
||||
if err != cuda.CUresult.CUDA_SUCCESS:
|
||||
raise RuntimeError('Cuda Error: {}'.format(err))
|
||||
|
||||
err, kernel = cuda.cuModuleGetFunction(
|
||||
module, bytes(str.encode(operation_name)))
|
||||
self.compiled_cache_device.insert(key, kernel)
|
||||
|
||||
compiled_host_fns = {}
|
||||
host_lib = CDLLBin(host_binary)
|
||||
|
||||
func_name = operation_name + '_get_params'
|
||||
func = getattr(host_lib, func_name)
|
||||
func.restype = ctypes.POINTER(ctypes.c_char * op_attr[0])
|
||||
compiled_host_fns['get_args'] = func
|
||||
|
||||
func_name = operation_name + '_shared_memory_size'
|
||||
func = getattr(host_lib, func_name)
|
||||
compiled_host_fns['shared_memory_capacity'] = func()
|
||||
|
||||
for attr in op_attr:
|
||||
if isinstance(attr, str):
|
||||
func_name = operation_name + '_' + attr
|
||||
func = getattr(host_lib, func_name)
|
||||
|
||||
# Set the return type of the function
|
||||
if attr in extra_funcs and extra_funcs[attr] != None:
|
||||
func.restype = extra_funcs[attr]
|
||||
|
||||
compiled_host_fns[attr] = func
|
||||
|
||||
        self.compiled_cache_host.insert(key, compiled_host_fns)
        return True

    def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation):
        """
        Compile a list of kernels and store them into the database
        """
        source_buffer_device = ""
        source_buffer_host = ""
        # 1. includes
        includes = []
        for operation in operation_list:
            for incl in operation.emitter.includes:
                if incl not in includes:
                    includes.append(incl)

        includes_host = [
            "builtin_types.h", "device_launch_parameters.h", "stddef.h"] + includes
        for incl in includes:
            source_buffer_device += SubstituteTemplate(
                IncludeTemplate, {'include': incl})

        for incl in includes_host:
            if "/device/" not in incl:
                source_buffer_host += SubstituteTemplate(
                    IncludeTemplate, {'include': incl})

        # 2. Operations
        for operation in operation_list:
            source_buffer_device += operation.emit()
            source_buffer_host += operation.emit()
            values = {
                'operation_name': operation.name(),
                'operation_suffix': operation.emitter.operation_suffix
            }
            source_buffer_device += SubstituteTemplate(
                operation.KernelTemplate, values)
            source_buffer_host += SubstituteTemplate(
                operation.HostTemplate, values)

        if self.backend == "nvrtc":
            # 3. compile
            err, program = nvrtc.nvrtcCreateProgram(
                str.encode(source_buffer_device),
                bytes(str.encode("module.cu")),
                0, [], [])

            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            # Compile program
            options = compilation_options.get()

            err, = nvrtc.nvrtcCompileProgram(program, len(options), options)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                error_string = 'NVRTC Error: {}\n'.format(err)

                # Get log from compilation
                err, logSize = nvrtc.nvrtcGetProgramLogSize(program)
                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                    raise RuntimeError('NVRTC Error: {}'.format(err))

                log = b' ' * logSize
                err, = nvrtc.nvrtcGetProgramLog(program, log)
                if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                    raise RuntimeError('NVRTC Error: {}'.format(err))

                raise RuntimeError(
                    error_string + log.decode() + source_buffer_device)

            # Get data from compilation
            err, dataSize = nvrtc.nvrtcGetCUBINSize(program)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

            cubin_image = b' ' * dataSize
            err, = nvrtc.nvrtcGetCUBIN(program, cubin_image)
            if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
                raise RuntimeError('NVRTC Error: {}'.format(err))

        else:  # with nvcc backend
            # emit code
            tempfile.tempdir = "./"
            temp_cu = tempfile.NamedTemporaryFile(
                prefix='kernel', suffix='.cu', delete=True)
            temp_cubin = tempfile.NamedTemporaryFile(
                prefix='kernel', suffix='.cubin', delete=True)
            with open(temp_cu.name, 'w') as file:
                file.write(source_buffer_device)

            # compile with nvcc
            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
            cmd_template = "${cuda_install_path}/bin/nvcc ${options} -cubin ${srcfile} -o ${tarfile}"
            values = {
                "cuda_install_path": cuda_install_path,
                "options": compilation_options.get_str(),
                "srcfile": temp_cu.name,
                "tarfile": temp_cubin.name
            }
            cmd = SubstituteTemplate(cmd_template, values)
            os.system(cmd)

            # load the cubin image
            with open(temp_cubin.name, 'rb') as file:
                cubin_image = file.read()

        # Set up the host-side library code
        if requires_nvcc_hostlib_compilation:
            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
            cmd_template = "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" % source_buffer_host
            cmd = SubstituteTemplate(
                cmd_template,
                {
                    "cuda_install_path": cuda_install_path,
                    "options": compilation_options.get_str()
                })
        else:
            options = compilation_options.get()
            cmd = "echo '%s'|g++ -x c++ -fpermissive -w -fPIC" % source_buffer_host
            filtered_opts = ['-default-device', '-Xcicc', '-Xllc', '--expt-relaxed-constexpr', '-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored']
            for opt in options:
                opt = opt.decode("utf-8")
                if opt not in filtered_opts and '-arch=sm_' not in opt:
                    if '--include-path=' in opt:
                        cmd += " " + opt.replace('--include-path=', '-I')
                    else:
                        cmd += " " + opt

        tempfile.tempdir = "./"
        temp = tempfile.NamedTemporaryFile(
            prefix='host_func', suffix='.so', delete=True)

        cmd += ' - -shared -o %s -lcudart -lcuda' % temp.name
        os.system(cmd)
        host_lib = ctypes.CDLL(temp.name)

        return cubin_image, host_lib, temp
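    # Illustrative note (not part of the original file): with
    # CUDA_INSTALL_PATH=/usr/local/cuda and hypothetical temp-file names, the
    # substituted nvcc command above comes out roughly as
    #
    #   /usr/local/cuda/bin/nvcc <options> -cubin ./kernelXXXX.cu -o ./kernelXXXX.cubin
    #
    # i.e. the emitted device source is compiled straight to a .cubin, which is
    # loaded later with cuda.cuModuleLoadData().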

    def add_module(self, operations, compile_options=None):
        """
        Insert a new compiled device module
        """
        if compile_options is None:
            cutlass_path = os.getenv('CUTLASS_PATH')
            assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."
            cuda_install_path = os.getenv('CUDA_INSTALL_PATH')
            assert cuda_install_path is not None, "Environment variable 'CUDA_INSTALL_PATH' is not defined."
            include_paths = [
                cuda_install_path + '/include',
                cutlass_path + '/include',
                cutlass_path + '/tools/util/include',
                cutlass_path + '/tools/library/scripts/pycutlass/src/cpp/include'
            ]

            if pycutlass.DEVICE_CC is not None:
                arch = pycutlass.DEVICE_CC
            else:
                # Find the maximum arch tag among the provided operations and compile for that target.
                # Since we are compiling to .cubin files, only one architecture may be specified.
                arch = max([op.arch for op in operations])
            compile_options = CompilationOptions(
                self.default_compile_options, arch, include_paths)
        # save the cubin
        operation_key = []
        operation_list = []
        requires_nvcc_hostlib_compilation = False
        for operation in operations:
            # step 1: use the emitted kernel source as the cache key
            key = operation.rt_module.emit() + operation.procedural_name() + self.backend
            # step 2: check whether the operation is already in the cache
            compiled_kernel = self.compiled_cache_device.at(key)

            if compiled_kernel is None:
                hit = self.load_operation(key, getattr(operation.rt_module, 'extra_funcs', {}))
                if hit:
                    compiled_kernel = self.compiled_cache_device.at(key)
                    assert compiled_kernel is not None
            if compiled_kernel is not None:
                operation.rt_module.kernel = compiled_kernel
                compiled_host_fns = self.compiled_cache_host.at(key)
                assert compiled_host_fns is not None
                for key in compiled_host_fns.keys():
                    setattr(operation.rt_module, key, compiled_host_fns[key])
                operation.rt_module.initialize()
            else:
                operation_list.append(operation.rt_module)
                operation_key.append(key)

            # Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate
            # the PyCUTLASS host-side library. Otherwise, g++ will be used.
            if isinstance(operation, pycutlass.gemm_operation.GemmOperationUniversal) and operation.api == pycutlass.library.ApiVersion.v3x:
                if self.backend == "nvrtc":
                    raise RuntimeError('CUTLASS 3 kernels currently require NVCC for compilation.')

                requires_nvcc_hostlib_compilation = True

        if len(operation_list) > 0:
            cubin_image, host_lib, host_file = self.emit_compile_(
                operation_list, compile_options, requires_nvcc_hostlib_compilation)

            err, module = cuda.cuModuleLoadData(cubin_image)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError('Cuda Error: {}'.format(err))

            operation_name = []
            operation_attr = []
            for operation, key in zip(operation_list, operation_key):
                # get device kernels
                err, operation.kernel = cuda.cuModuleGetFunction(
                    module,
                    bytes(str.encode(operation.name()))
                )
                operation_name.append(operation.name())
                self.compiled_cache_device.insert(key, operation.kernel)
                # get host functions
                compiled_host_fns = {}
                op_attr = []

                # get param size
                func_name = operation.name() + '_get_param_size'
                func = getattr(host_lib, func_name)
                param_size = func()

                func_name = operation.name() + '_get_params'
                func = getattr(host_lib, func_name)
                func.argtype = operation.argtype
                func.restype = ctypes.POINTER(ctypes.c_char * param_size)
                setattr(operation, 'get_args', func)
                compiled_host_fns['get_args'] = func

                # set shared memory size
                func_name = operation.name() + '_shared_memory_size'
                func = getattr(host_lib, func_name)
                setattr(operation, 'shared_memory_capacity', func())
                compiled_host_fns['shared_memory_capacity'] = func()
                # set the maximum dynamic shared size
                operation.initialize()

                # get extra functions
                op_attr.append(param_size)

                if hasattr(operation, "extra_funcs"):
                    for suffix, ret_type in operation.extra_funcs.items():
                        func_name = operation.name() + '_' + suffix
                        func = getattr(host_lib, func_name)
                        if ret_type is not None:
                            func.restype = ret_type
                        setattr(operation, suffix, func)
                        compiled_host_fns[suffix] = func
                        op_attr.append(suffix)

                operation_attr.append(op_attr)
                self.compiled_cache_host.insert(key, compiled_host_fns)

            for key, operation_name, operation_attr in zip(operation_key, operation_name, operation_attr):
                self.insert_operation(
                    key, cubin_image, host_file.name, operation_name, operation_attr)
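
# Minimal usage sketch (an assumption based on the pycutlass examples, not part
# of the original file): pycutlass exposes a module-level ArtifactManager as
# `pycutlass.compiler`, so compiling the kernels behind a set of operations
# typically looks like:
#
#   import pycutlass
#   pycutlass.get_memory_pool(2 ** 30, 2 ** 30)  # device memory pool for workspaces
#   pycutlass.compiler.add_module([operation])   # compile, or reuse the cache
#
# add_module() is effectively idempotent per (kernel source, backend) key:
# recompilation is skipped whenever the key hits the in-memory or on-disk cache.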
@ -1,632 +0,0 @@
from typeguard import typechecked
from cuda import cuda
from typing import Union
import numpy as np

from pycutlass import *


# @typechecked
class Conv2dArguments(ArgumentBase):
    """
    Argument wrapper for Conv2d. It encodes problem information and
    user-provided tensors into the kernel's arguments.

    :param operation: the Conv2d operation to take the argument
    :type operation: :class:`pycutlass.Conv2dOperation`

    :param problem_size: the Conv2d problem size
    :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`

    :param A: tensor A
    :type A: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param B: tensor B
    :type B: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param C: tensor C
    :type C: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param D: tensor D
    :type D: cuda.CUdeviceptr | numpy.ndarray | torch.Tensor | cupy.ndarray

    :param split_k_mode: conv2d split K mode, defaults to
        cutlass.conv.SplitKMode.Serial
    :type split_k_mode: cutlass.conv.SplitKMode, optional

    :param output_op: output operator, optional
    :type output_op: :class:`pycutlass.LinearCombinationFunctorArguments`
    """

    def __init__(self, operation: 'Conv2dOperation',
                 problem_size: 'cutlass.conv.Conv2dProblemSize',
                 A: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 B: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 C: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 D: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 split_k_mode: 'cutlass.conv.SplitKMode'
                 = cutlass.conv.SplitKMode.Serial, **kwargs) -> None:

        self.operation = operation
        #: convolution kind
        self.conv_kind: cutlass.conv.Operator = operation.conv_kind
        self.layout_A: cutlass.layout = operation.A.layout
        self.layout_B: cutlass.layout = operation.B.layout
        self.layout_C: cutlass.layout = operation.C.layout

        self.element_A = operation.A.element
        self.element_B = operation.B.element
        self.element_C = operation.C.element

        if self.layout_C == cutlass.TensorNC32HW32:
            B = self.reorder_tensor_B(B, problem_size)

        super().__init__(A, B, C, D, **kwargs)

        # preprocess the output operator
        if 'output_op' in kwargs.keys() and \
                split_k_mode != cutlass.conv.SplitKMode.Parallel:
            self.output_op = kwargs['output_op']
        else:
            self.output_op = self.operation.epilogue_type(1.0, 0.0)

        if "split_k_slices" in kwargs.keys():
            self.split_k_mode = split_k_mode
            self.split_k_slices = kwargs["split_k_slices"]
        else:
            self.split_k_mode = cutlass.conv.SplitKMode.Serial
            self.split_k_slices = 1

        #: problem_size
        self.problem_size: cutlass.conv.Conv2dProblemSize = problem_size
        self.problem_size.split_k_slices = self.split_k_slices

        if hasattr(self, "tensor_c_numel"):
            c_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
                self.conv_kind, problem_size)
            if (self.tensor_c_numel == c_coord.at(3) and
                    self.tensor_c_numel < c_coord.size()):
                self.bias = True

        #
        # initialize the argument
        #
        self.initialize()

    # @typechecked
    def reorder_tensor_B(self, tensor_B: 'np.ndarray',
                         problem_size: 'cutlass.conv.Conv2dProblemSize'):
        """
        Reorder tensor_B for interleaved layout

        :param tensor_B: input tensor B
        :type tensor_B: numpy.ndarray
        :param problem_size: Conv2d problem size
        :type problem_size: :class:`cutlass.conv.Conv2dProblemSize`

        :return: reordered tensor B
        :rtype: numpy.ndarray
        """
        reordered_tensor_B = np.empty_like(tensor_B)
        tensor_ref_B = self.get_tensor_ref(
            tensor_B, self.element_B, self.layout_B, problem_size, "b")
        reordered_tensor_ref_B = self.get_tensor_ref(
            reordered_tensor_B, self.element_B,
            self.layout_B, problem_size, "b")
        cutlass.conv.host.reorder_convK(
            reordered_tensor_ref_B, tensor_ref_B, self.conv_kind, problem_size)

        return reordered_tensor_B

    def get_tensor_ref(
            self, tensor, dtype, tensor_layout, problem_size, operand):
        if operand == "a":
            tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(
                self.conv_kind, problem_size)
        elif operand == "b":
            tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(
                self.conv_kind, problem_size)
        elif operand in ["c", "d"]:
            tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(
                self.conv_kind, problem_size)
        else:
            raise ValueError("unknown operand: " + operand)
        # Zero stride trick
        if operand == "c" and self.bias:
            tensor_coord = cutlass.Tensor4DCoord(0, 0, 0, 0)

        layout = tensor_layout.packed(tensor_coord)

        return TensorRef(tensor, dtype, layout).tensor_ref

    def get_arguments(self, semaphore):
        ref_A = TensorRef_(self.get_tensor_ref(
            self.ptr_A, self.element_A, self.layout_A, self.problem_size, "a"))
        ref_B = TensorRef_(self.get_tensor_ref(
            self.ptr_B, self.element_B, self.layout_B, self.problem_size, "b"))
        ref_C = TensorRef_(self.get_tensor_ref(
            self.ptr_C, self.element_C, self.layout_C, self.problem_size, "c"))
        ref_D = TensorRef_(self.get_tensor_ref(
            self.ptr_D, self.element_C, self.layout_C, self.problem_size, "d"))

        self.c_arguments = self.operation.argument_type(
            Conv2DProblemSize(self.problem_size),
            ref_A, ref_B, ref_C, ref_D, self.output_op, self.split_k_mode
        )

        self.semaphore = semaphore

    def initialize(self):
        """
        Initialize the kernel arguments. This handles the following tasks:
        1. get the kernel launch configuration, including grid and cta sizes
           and the dynamic shared memory capacity
        2. allocate and initialize the device workspace
        3. get the kernel params as a bytearray for NVRTC input
        """
        # get launch configuration
        self.launch_config = self.operation.rt_module.plan(self)

        # allocate and initialize device workspace
        device_workspace_size = \
            self.operation.rt_module.get_device_workspace_size(self)

        if device_workspace_size > 0:
            self.workspace_buffer = device_mem_alloc(device_workspace_size)
            workspace_ptr = self.workspace_buffer.ptr
            err, = cuda.cuMemsetD32(
                workspace_ptr, 0, device_workspace_size // 4)
        else:
            workspace_ptr = None

        # get kernel params as bytearray
        semaphore = 0
        if workspace_ptr is not None and \
                self.split_k_mode == cutlass.conv.SplitKMode.Parallel:
            self.ptr_D = workspace_ptr
        elif workspace_ptr is not None and \
                self.split_k_mode == cutlass.conv.SplitKMode.Serial:
            semaphore = workspace_ptr

        self.get_arguments(semaphore)

        params_ = self.operation.rt_module.get_args(ctypes.byref(
            self.c_arguments), ctypes.c_void_p(int(self.semaphore)))
        self.host_workspace = bytearray(params_.contents)
        self.device_workspace = None

    def sync(self):
        """
        Synchronize the arguments. If an input tensor lives on the host,
        copy the result back from device to host.
        """
        return super().sync()


# @typechecked
class Conv2dRT(ExecutableOperation):
    """
    Conv2dRT manages the CUTLASS runtime components
    """
    KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {

  // Dynamic shared memory base pointer
  extern __shared__ int SharedStorageBase[];

  // Declare pointer to dynamic shared memory.
  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);

  ${operation_name}${operation_suffix} op;

  op(params, *shared_storage);
}
'''

    HostTemplate = r'''
extern "C" {
  // Get the size of params in bytes
  int ${operation_name}_get_param_size(){
    return sizeof(${operation_name}${operation_suffix}::Params);
  }

  // Get the size of dynamic shared memory in bytes
  int ${operation_name}_shared_memory_size() {
    return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
  }

  // Get the params as byte array
  char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Arguments* arguments, int *semaphore=nullptr){
    typename ${operation_name}${operation_suffix}::Params* params;
    params = new ${operation_name}${operation_suffix}::Params(*arguments, semaphore);

    char *bytes = ((char*)(params));
    char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
    for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
      output[i] = bytes[i];

    return output;
  }
}
'''

    def __init__(self, operation: 'Conv2dOperation'):
        super().__init__(operation)
        self.argument_type, self.epilogue_type = get_conv2d_arguments(operation.epilogue_functor)
        self.argtype = [ctypes.POINTER(self.argument_type), ctypes.c_void_p]
        self.conv_kind = operation.conv_kind

        self.operation: Conv2dOperation = operation

        self.emitter = EmitConv2dInstance('_type')

        self.threads: int = operation.tile_description.num_threads

        self.swizzle_functor = operation.swizzling_functor

    def emit(self):
        return self.emitter.emit(self.operation)

    # @typechecked
    def get_device_workspace_size(self, arguments: Conv2dArguments):
        workspace_bytes = 0

        launch_config = arguments.launch_config

        self.conv_kind = self.operation.conv_kind

        if arguments.split_k_mode == cutlass.conv.SplitKMode.Parallel:
            problem_size = arguments.problem_size
            workspace_bytes = DataTypeSize[self.operation.C.element] \
                * launch_config.grid[2] * cutlass.conv.implicit_gemm_tensor_c_size(
                    self.conv_kind, problem_size
                ) // 8
        elif arguments.split_k_mode == cutlass.conv.SplitKMode.Serial and \
                arguments.split_k_slices > 1:
            workspace_bytes = launch_config.grid[0] * launch_config.grid[1] * 4

        return workspace_bytes

    # @typechecked
    def plan(self, arguments: Conv2dArguments):
        tile_size = cutlass.gemm.GemmCoord(
            self.operation.tile_description.threadblock_shape[0],
            self.operation.tile_description.threadblock_shape[1],
            self.operation.tile_description.threadblock_shape[2]
        )

        grid = self.swizzle_functor.get_grid_shape(
            self.swizzle_functor.get_tiled_shape(
                self.conv_kind, arguments.problem_size,
                tile_size, arguments.split_k_slices
            )
        )
        return LaunchConfiguration(
            [grid.x, grid.y, grid.z], [self.threads, 1, 1],
            self.shared_memory_capacity)

    def initialize(self):
        err, = cuda.cuFuncSetAttribute(
            self.kernel,
            attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
            value=self.shared_memory_capacity)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('Cuda Error: {}'.format(err))


#
class Conv2dOperation:
    """
    CUTLASS Conv2d operation description.

    :param conv_kind: convolution operator
    :type conv_kind: :class:`cutlass.conv.Operator`

    :param iterator_algorithm: Selects among several implementation
        variants trading off performance with simplicity
    :type iterator_algorithm: :class:`cutlass.conv.IteratorAlgorithm`

    :param arch: GPU compute capability (sm_xx)
    :type arch: int

    :param tile_description: tile description
    :type tile_description: :class:`pycutlass.TileDescription`

    :param A: tensor A description
    :type A: :class:`pycutlass.TensorDescription`

    :param B: tensor B description
    :type B: :class:`pycutlass.TensorDescription`

    :param C: tensor C description
    :type C: :class:`pycutlass.TensorDescription`

    :param element_epilogue: element type for computation in epilogue
    :type element_epilogue: cutlass.int8 | cutlass.int32 | cutlass.float16 |
        cutlass.bfloat16 | cutlass.float32 | cutlass.float64

    :param stride_support: distinguish among partial specializations that
        accelerate certain problems where convolution stride is unit
    :type stride_support: :class:`cutlass.conv.StrideSupport`

    :param epilogue_functor: convolution epilogue functor
    :type epilogue_functor: :class:`EpilogueFunctor`

    :param swizzling_functor: threadblock swizzling functor
    """
    #

    def __init__(self,
                 conv_kind: cutlass.conv.Operator,
                 iterator_algorithm: cutlass.conv.IteratorAlgorithm,
                 arch: int, tile_description: TileDescription,
                 A: TensorDescription, B: TensorDescription, C: TensorDescription,
                 stride_support, epilogue_functor,
                 swizzling_functor=cutlass.IdentitySwizzle1):

        self.operation_kind: OperationKind = OperationKind.Conv2d
        self.arch: int = arch
        self.tile_description: TileDescription = tile_description
        self.conv_kind = conv_kind
        self.A: TensorDescription = A
        self.B: TensorDescription = B
        self.C: TensorDescription = C
        self.epilogue_functor = epilogue_functor
        self.iterator_algorithm = iterator_algorithm
        self.stride_support = stride_support
        self.swizzling_functor = swizzling_functor()

        self.rt_module: Conv2dRT = Conv2dRT(self)
        self.argument_type = self.rt_module.argument_type
        self.epilogue_type = self.rt_module.epilogue_type

    def run(self, arguments: Conv2dArguments) -> cuda.CUresult:
        """
        Launch the cuda kernel with input arguments

        :param arguments: conv2d arguments
        :type arguments: :class:`pycutlass.Conv2dArguments`
        """
        # launch the kernel
        err = self.rt_module.run(
            arguments.host_workspace,
            arguments.device_workspace,
            arguments.launch_config)

        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('CUDA Error %s' % str(err))

        return err

    #
    # Get function name
    #

    def procedural_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
        return self.configuration_name()

    #
    def configuration_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''

        opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]

        threadblock = "%dx%d_%dx%d" % (
            self.tile_description.threadblock_shape[0],
            self.tile_description.threadblock_shape[1],
            self.tile_description.threadblock_shape[2],
            self.tile_description.stages
        )

        if self.stride_support == StrideSupport.Unity:
            configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_unity_stride_align${alignment}"
        else:
            configuration_name = "cutlass_sm${arch}_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}"

        return SubstituteTemplate(
            configuration_name,
            {
                'arch': str(self.arch),
                'opcode_class': opcode_class_name,
                'extended_name': self.extended_name(),
                'threadblock': threadblock,
                'layout': self.layout_name(),
                'alignment': "%d" % self.A.alignment,
            }
        )

    #
    def extended_name(self):
        ''' Append data types if they differ from compute type. '''
        if self.C.element != self.tile_description.math_instruction.element_accumulator and \
                self.A.element != self.tile_description.math_instruction.element_accumulator:
            extended_name = "${element_c}_${core_name}_${element_a}"
        elif self.C.element == self.tile_description.math_instruction.element_accumulator and \
                self.A.element != self.tile_description.math_instruction.element_accumulator:
            extended_name = "${core_name}_${element_a}"
        else:
            extended_name = "${core_name}"

        extended_name = SubstituteTemplate(extended_name, {
            'element_a': DataTypeNames[self.A.element],
            'element_c': DataTypeNames[self.C.element],
            'core_name': self.core_name()
        })

        return extended_name

    #
    def layout_name(self):
        return "%s" % (ShortLayoutTypeNames[self.A.layout])

    #
    def core_name(self):
        ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''

        intermediate_type = ''

        if self.tile_description.math_instruction.opcode_class == cutlass.OpClass.TensorOp:
            inst_shape = "%dx%dx%d" % tuple(
                self.tile_description.math_instruction.instruction_shape)
            if self.tile_description.math_instruction.element_a != self.A.element and \
                    self.tile_description.math_instruction.element_a != self.accumulator_type():
                intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
        else:
            inst_shape = ''

        return "%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()],
                                inst_shape, intermediate_type, ConvKindNames[self.conv_kind],
                                IteratorAlgorithmNames[self.iterator_algorithm])

    #
    def is_complex(self):
        complex_operators = [
            MathOperation.multiply_add_complex,
            MathOperation.multiply_add_complex_gaussian
        ]
        return self.tile_description.math_instruction.math_operation in complex_operators

    #
    def accumulator_type(self):
        accum = self.tile_description.math_instruction.element_accumulator

        if self.is_complex():
            return get_complex_from_real(accum)

        return accum
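    # Illustrative example (assumed values, not part of the original file): for
    # an SM80 TensorOp fprop kernel accumulating in f16 with a 128x128_32x3
    # threadblock, NHWC layout, and alignment 8, configuration_name() yields a
    # string shaped like
    #
    #   cutlass_sm80_tensorop_h16x8x16fprop_optimized_128x128_32x3_nhwc_align8
    #
    # where the middle piece comes from core_name(): accumulator letter,
    # instruction shape, conv kind, and iterator algorithm.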


###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################

class EmitConv2dInstance:
    def __init__(self, operation_suffix=''):
        self.operation_suffix = operation_suffix
        self.includes = [
            "cutlass/cutlass.h",
            "cutlass/conv/kernel/default_conv2d_fprop.h",
            "cutlass/conv/kernel/default_conv2d_dgrad.h",
            "cutlass/conv/kernel/default_conv2d_wgrad.h"
        ]
        self.template = """
// Conv2d${conv_kind_name} ${iterator_algorithm_name} kernel instance "${operation_name}"
using ${operation_name}_base =
typename cutlass::conv::kernel::DefaultConv2d${conv_kind_name}<
  ${element_a},
  ${layout_a},
  ${element_b},
  ${layout_b},
  ${element_c},
  ${layout_c},
  ${element_accumulator},
  ${opcode_class},
  ${arch},
  cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
  cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k} >,
  cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
  ${epilogue_functor},
  ${swizzling_functor}, // cutlass::gemm::threadblock::GemmSplitKIdentityThreadblockSwizzle<>,
  ${stages},
  ${math_operator},
  ${iterator_algorithm},
  ${stride_support},
  ${align_a},
  ${align_b}
>::Kernel;

struct ${operation_name}${operation_suffix}:
  public ${operation_name}_base { };

"""

    def emit(self, operation):

        warp_shape = [int(operation.tile_description.threadblock_shape[idx] /
                          operation.tile_description.warp_count[idx]) for idx in range(3)]

        epilogue_vector_length = int(min(
            operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])

        values = {
            'operation_name': operation.procedural_name(),
            'operation_suffix': self.operation_suffix,
            'conv_kind': ConvKindTag[operation.conv_kind],
            'conv_kind_name': ConvKindNames[operation.conv_kind].capitalize(),
            'element_a': DataTypeTag[operation.A.element],
            'layout_a': LayoutTag[operation.A.layout],
            'element_b': DataTypeTag[operation.B.element],
            'layout_b': LayoutTag[operation.B.layout],
            'element_c': DataTypeTag[operation.C.element],
            'layout_c': LayoutTag[operation.C.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': operation.epilogue_functor.emit(),
            'swizzling_functor': operation.swizzling_functor.tag(),
            'stages': str(operation.tile_description.stages),
            'iterator_algorithm': IteratorAlgorithmTag[operation.iterator_algorithm],
            'iterator_algorithm_name': IteratorAlgorithmNames[operation.iterator_algorithm].capitalize(),
            'stride_support': StrideSupportTag[operation.stride_support],
            'math_operator': 'cutlass::arch::OpMultiplyAddComplex' if operation.is_complex() else
                             MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'align_a': str(operation.A.alignment),
            'align_b': str(operation.B.alignment),
        }

        return SubstituteTemplate(self.template, values)
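
# End-to-end usage sketch (hedged: the tile/problem values are invented and the
# helper classes follow the pycutlass examples rather than anything defined in
# this file):
#
#   math_inst = MathInstruction([16, 8, 16], cutlass.float16, cutlass.float16,
#                               cutlass.float32, cutlass.OpClass.TensorOp)
#   tile = TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst)
#   A = TensorDescription(cutlass.float16, cutlass.TensorNHWC, 8)
#   B = TensorDescription(cutlass.float16, cutlass.TensorNHWC, 8)
#   C = TensorDescription(cutlass.float32, cutlass.TensorNHWC, 4)
#   epilogue = LinearCombination(C.element, C.alignment,
#                                math_inst.element_accumulator, cutlass.float32)
#   operation = Conv2dOperation(cutlass.conv.Operator.fprop,
#                               cutlass.conv.IteratorAlgorithm.optimized,
#                               80, tile, A, B, C, StrideSupport.Strided, epilogue)
#   pycutlass.compiler.add_module([operation])
#   args = Conv2dArguments(operation, problem_size,
#                          A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D)
#   operation.run(args)
#   args.sync()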
@ -1,104 +0,0 @@
import numpy as np
from cuda import cuda
from pycutlass.memory_manager import *
from typing import TYPE_CHECKING

try:
    import torch
    torch_available = True
except ImportError:
    torch_available = False
    if TYPE_CHECKING:
        import torch

try:
    import cupy as cp
    cupy_available = True
except ImportError:
    cupy_available = False
    if TYPE_CHECKING:
        import cupy as cp


class NumpyFrontend:
    """
    Frontend node for numpy
    """

    @staticmethod
    def argument(np_tensor: 'np.ndarray', is_output: 'bool') -> cuda.CUdeviceptr:
        """Convert the input numpy tensor to a CUDA device pointer

        :param np_tensor: input numpy ndarray
        :param is_output: whether the tensor is an output

        :return: CUDA device pointer
        """
        # allocate device memory for outputs; copy input data to device
        if is_output:
            return device_mem_alloc(np_tensor.size * np_tensor.itemsize)
        else:
            return todevice(np_tensor)


class TorchFrontend:
    """
    Frontend node for torch
    """

    @staticmethod
    def argument(torch_tensor: 'torch.Tensor') -> cuda.CUdeviceptr:
        """Convert the input torch tensor to a CUDA device pointer

        :param torch_tensor: input torch tensor

        :return: CUDA device pointer
        """
        # check the device of torch_tensor
        if not torch_tensor.is_cuda:
            torch_tensor = torch_tensor.to("cuda")

        return cuda.CUdeviceptr(torch_tensor.data_ptr())


class CupyFrontend:
    """
    Frontend node for cupy
    """

    @staticmethod
    def argument(cupy_ndarray: 'cp.ndarray'):
        return cuda.CUdeviceptr(int(cupy_ndarray.data.ptr))
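
# Small usage sketch (illustrative, not part of the original file): all three
# frontends normalize host-side tensors into CUdeviceptr kernel arguments.
#
#   import numpy as np
#   a_host = np.random.randn(1024).astype(np.float32)
#   ptr_in = NumpyFrontend.argument(a_host, is_output=False)  # copies to device
#   ptr_out = NumpyFrontend.argument(a_host, is_output=True)  # allocates only
#
# TorchFrontend.argument() reuses (or moves) the tensor's own device storage,
# and CupyFrontend.argument() simply wraps the existing device pointer.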
@ -1,870 +0,0 @@
import re

###################################################################################################

import enum
import cutlass
import cute

# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility

try:
    from enum import auto as enum_auto
except ImportError:
    __cutlass_library_auto_enum = 0

    def enum_auto() -> int:
        global __cutlass_library_auto_enum
        i = __cutlass_library_auto_enum
        __cutlass_library_auto_enum += 1
        return i
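# Illustrative sketch (not part of the original file): with the fallback above,
# successive enum_auto() calls yield 0, 1, 2, ... so enum members still get
# distinct values. Unlike enum.auto(), the counter starts at 0 and is shared
# module-wide rather than per-Enum class.
#
#   class _Example(enum.Enum):
#       first = enum_auto()   # e.g. 0
#       second = enum_auto()  # first's value + 1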
###################################################################################################

#
class GeneratorTarget(enum.Enum):
    Library = enum_auto()

#
GeneratorTargetNames = {
    GeneratorTarget.Library: 'library',
}
#

###################################################################################################

#
ShortDataTypeNames = {
    cutlass.int32: 'i',
    cutlass.float16: 'h',
    cutlass.float32: 's',
    cutlass.float64: 'd',
    cutlass.dtype.cf32: 'c',
    cutlass.dtype.cf64: 'z',
}

#
DataTypeNames = {
    cutlass.dtype.b1: "b1",
    cutlass.dtype.u4: "u4",
    cutlass.dtype.u8: "u8",
    cutlass.dtype.u16: "u16",
    cutlass.dtype.u32: "u32",
    cutlass.dtype.u64: "u64",
    cutlass.dtype.s4: "s4",
    cutlass.int8: "s8",
    cutlass.dtype.s16: "s16",
    cutlass.int32: "s32",
    cutlass.dtype.s64: "s64",
    cutlass.float16: "f16",
    cutlass.bfloat16: "bf16",
    cutlass.float32: "f32",
    cutlass.tfloat32: "tf32",
    cutlass.float64: "f64",
    cutlass.dtype.cf16: "cf16",
    cutlass.dtype.cbf16: "cbf16",
    cutlass.dtype.cf32: "cf32",
    cutlass.dtype.ctf32: "ctf32",
    cutlass.dtype.cf64: "cf64",
    cutlass.dtype.cu4: "cu4",
    cutlass.dtype.cu8: "cu8",
    cutlass.dtype.cu16: "cu16",
    cutlass.dtype.cu32: "cu32",
    cutlass.dtype.cu64: "cu64",
    cutlass.dtype.cs4: "cs4",
    cutlass.dtype.cs8: "cs8",
    cutlass.dtype.cs16: "cs16",
    cutlass.dtype.cs32: "cs32",
    cutlass.dtype.cs64: "cs64",
}

DataTypeTag = {
    cutlass.dtype.b1: "cutlass::uint1b_t",
    cutlass.dtype.u4: "cutlass::uint4b_t",
    cutlass.dtype.u8: "uint8_t",
    cutlass.dtype.u16: "uint16_t",
    cutlass.dtype.u32: "uint32_t",
    cutlass.dtype.u64: "uint64_t",
    cutlass.dtype.s4: "cutlass::int4b_t",
    cutlass.int8: "int8_t",
    cutlass.dtype.s16: "int16_t",
    cutlass.int32: "int32_t",
    cutlass.dtype.s64: "int64_t",
    cutlass.float16: "cutlass::half_t",
    cutlass.bfloat16: "cutlass::bfloat16_t",
    cutlass.float32: "float",
    cutlass.tfloat32: "cutlass::tfloat32_t",
    cutlass.float64: "double",
    cutlass.dtype.cf16: "cutlass::complex<cutlass::half_t>",
    cutlass.dtype.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
    cutlass.dtype.cf32: "cutlass::complex<float>",
    cutlass.dtype.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
    cutlass.dtype.cf64: "cutlass::complex<double>",
    cutlass.dtype.cu4: "cutlass::complex<cutlass::uint4b_t>",
    cutlass.dtype.cu8: "cutlass::complex<cutlass::uint8_t>",
    cutlass.dtype.cu16: "cutlass::complex<cutlass::uint16_t>",
    cutlass.dtype.cu32: "cutlass::complex<cutlass::uint32_t>",
    cutlass.dtype.cu64: "cutlass::complex<cutlass::uint64_t>",
    cutlass.dtype.cs4: "cutlass::complex<cutlass::int4b_t>",
    cutlass.dtype.cs8: "cutlass::complex<cutlass::int8_t>",
    cutlass.dtype.cs16: "cutlass::complex<cutlass::int16_t>",
    cutlass.dtype.cs32: "cutlass::complex<cutlass::int32_t>",
    cutlass.dtype.cs64: "cutlass::complex<cutlass::int64_t>",
}

DataTypeSize = {
    cutlass.dtype.b1: 1,
    cutlass.dtype.u4: 4,
    cutlass.dtype.u8: 8,
    cutlass.dtype.u16: 16,
    cutlass.dtype.u32: 32,
    cutlass.dtype.u64: 64,
    cutlass.dtype.s4: 4,
    cutlass.int8: 8,
    cutlass.dtype.s16: 16,
    cutlass.int32: 32,
    cutlass.dtype.s64: 64,
    cutlass.float16: 16,
    cutlass.bfloat16: 16,
    cutlass.float32: 32,
    cutlass.tfloat32: 32,
    cutlass.float64: 64,
    cutlass.dtype.cf16: 32,
    cutlass.dtype.cbf16: 32,
    cutlass.dtype.cf32: 64,
    cutlass.dtype.ctf32: 32,
    cutlass.dtype.cf64: 128,
    cutlass.dtype.cu4: 8,
    cutlass.dtype.cu8: 16,
    cutlass.dtype.cu16: 32,
    cutlass.dtype.cu32: 64,
    cutlass.dtype.cu64: 128,
    cutlass.dtype.cs4: 8,
    cutlass.dtype.cs8: 16,
    cutlass.dtype.cs16: 32,
    cutlass.dtype.cs32: 64,
    cutlass.dtype.cs64: 128,
}


class DataTypeSizeBytes:
    """
    Static class to mimic the `DataTypeSize` dictionary, but with checks for whether the
    data type key is less than a full byte or a non-integer number of bytes.
    """
    @staticmethod
    def __class_getitem__(datatype):
        """
        Returns the size of the data type in bytes. Raises an exception if the data type
        is either less than a full byte or a non-integer number of bytes in size.

        :param datatype: data type to query

        :return: number of bytes the data type occupies
        :rtype: int
        """
        bits = DataTypeSize[datatype]
        if bits < 8:
            raise Exception('Data type {} is less than one byte in size.'.format(datatype))
        elif bits % 8 != 0:
            raise Exception('Data type {} is not an integer number of bytes.'.format(datatype))
        return bits // 8
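# Illustrative usage (an added sketch, not part of the original file): the
# class-getitem form reads like the DataTypeSize dictionary, but in bytes.
#
#   DataTypeSizeBytes[cutlass.float16]   # -> 2
#   DataTypeSizeBytes[cutlass.int32]     # -> 4
#   DataTypeSizeBytes[cutlass.dtype.s4]  # raises: 4 bits is less than one byte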

###################################################################################################
#

class BlasMode(enum.Enum):
    symmetric = enum_auto()
    hermitian = enum_auto()


#
BlasModeTag = {
    BlasMode.symmetric: 'cutlass::BlasMode::kSymmetric',
    BlasMode.hermitian: 'cutlass::BlasMode::kHermitian',
}

#
ComplexTransformTag = {
    cutlass.complex_transform.none: 'cutlass::ComplexTransform::kNone',
    cutlass.complex_transform.conj: 'cutlass::ComplexTransform::kConjugate',
}

#
RealComplexBijection = [
    (cutlass.float16, cutlass.dtype.cf16),
    (cutlass.float32, cutlass.dtype.cf32),
    (cutlass.float64, cutlass.dtype.cf64),
]

#
def is_complex(data_type):
    for r, c in RealComplexBijection:
        if data_type == c:
            return True
    return False

#
def get_complex_from_real(real_type):
    for r, c in RealComplexBijection:
        if real_type == r:
            return c
    return cutlass.dtype.invalid

#
def get_real_from_complex(complex_type):
    for r, c in RealComplexBijection:
        if complex_type == c:
            return r
    return cutlass.dtype.invalid

#
class ComplexMultiplyOp(enum.Enum):
    multiply_add = enum_auto()
    gaussian = enum_auto()

###################################################################################################

#
class MathOperation(enum.Enum):
    multiply_add = enum_auto()
    multiply_add_saturate = enum_auto()
    xor_popc = enum_auto()
    multiply_add_fast_bf16 = enum_auto()
    multiply_add_fast_f16 = enum_auto()
    multiply_add_fast_f32 = enum_auto()
    multiply_add_complex_fast_f32 = enum_auto()
    multiply_add_complex = enum_auto()
    multiply_add_complex_gaussian = enum_auto()


#
MathOperationNames = {
    MathOperation.multiply_add: 'multiply_add',
    MathOperation.multiply_add_saturate: 'multiply_add_saturate',
    MathOperation.xor_popc: 'xor_popc',
    MathOperation.multiply_add_fast_bf16: 'multiply_add_fast_bf16',
    MathOperation.multiply_add_fast_f16: 'multiply_add_fast_f16',
    MathOperation.multiply_add_fast_f32: 'multiply_add_fast_f32',
    MathOperation.multiply_add_complex_fast_f32: 'multiply_add_complex_fast_f32',
    MathOperation.multiply_add_complex: 'multiply_add_complex',
    MathOperation.multiply_add_complex_gaussian: 'multiply_add_complex_gaussian',
}

#
MathOperationTag = {
    MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd',
    MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate',
    MathOperation.xor_popc: 'cutlass::arch::OpXorPopc',
    MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16',
    MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16',
    MathOperation.multiply_add_fast_f32: 'cutlass::arch::OpMultiplyAddFastF32',
    MathOperation.multiply_add_complex_fast_f32: 'cutlass::arch::OpMultiplyAddComplexFastF32',
    MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex',
    MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex',
}

###################################################################################################

#
LayoutTag = {
    cutlass.ColumnMajor: 'cutlass::layout::ColumnMajor',
    cutlass.RowMajor: 'cutlass::layout::RowMajor',
    cutlass.layout.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>',
    cutlass.layout.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>',
    cutlass.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>',
    cutlass.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>',
    cutlass.layout.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>',
    cutlass.layout.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>',
    cutlass.TensorNHWC: 'cutlass::layout::TensorNHWC',
    cutlass.layout.TensorNDHWC: 'cutlass::layout::TensorNDHWC',
    cutlass.layout.TensorNCHW: 'cutlass::layout::TensorNCHW',
    cutlass.layout.TensorNGHWC: 'cutlass::layout::TensorNGHWC',
    cutlass.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>',
    cutlass.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>',
    cutlass.layout.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>',
    cutlass.layout.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>',
}

#
TransposedLayout = {
    cutlass.ColumnMajor: cutlass.RowMajor,
    cutlass.RowMajor: cutlass.ColumnMajor,
    cutlass.layout.ColumnMajorInterleaved2: cutlass.layout.RowMajorInterleaved2,
    cutlass.layout.RowMajorInterleaved2: cutlass.layout.ColumnMajorInterleaved2,
    cutlass.ColumnMajorInterleaved32: cutlass.RowMajorInterleaved32,
    cutlass.RowMajorInterleaved32: cutlass.ColumnMajorInterleaved32,
    cutlass.layout.ColumnMajorInterleaved64: cutlass.layout.RowMajorInterleaved64,
    cutlass.layout.RowMajorInterleaved64: cutlass.layout.ColumnMajorInterleaved64,
    cutlass.TensorNHWC: cutlass.TensorNHWC
}

#
ShortLayoutTypeNames = {
    cutlass.ColumnMajor: 'n',
    cutlass.layout.ColumnMajorInterleaved2: 'n2',
    cutlass.ColumnMajorInterleaved32: 'n32',
    cutlass.layout.ColumnMajorInterleaved64: 'n64',
    cutlass.RowMajor: 't',
    cutlass.layout.RowMajorInterleaved2: 't2',
    cutlass.RowMajorInterleaved32: 't32',
    cutlass.layout.RowMajorInterleaved64: 't64',
    cutlass.TensorNHWC: 'nhwc',
    cutlass.layout.TensorNDHWC: 'ndhwc',
    cutlass.layout.TensorNCHW: 'nchw',
    cutlass.layout.TensorNGHWC: 'nghwc',
    cutlass.TensorNC32HW32: 'nc32hw32',
    cutlass.layout.TensorNC64HW64: 'nc64hw64',
    cutlass.TensorC32RSK32: 'c32rsk32',
    cutlass.layout.TensorC64RSK64: 'c64rsk64'
}

#
ShortComplexLayoutNames = {
    (cutlass.ColumnMajor, cutlass.complex_transform.none): 'n',
    (cutlass.ColumnMajor, cutlass.complex_transform.conj): 'c',
    (cutlass.RowMajor, cutlass.complex_transform.none): 't',
    (cutlass.RowMajor, cutlass.complex_transform.conj): 'h'
}

#
CuTeLayoutTag = {
    cute.GMMAMajor.K: 'cute::GMMA::Major::K',
    cute.GMMAMajor.MN: 'cute::GMMA::Major::MN'
}

###################################################################################################

#
class SideMode(enum.Enum):
    Left = enum_auto()
    Right = enum_auto()


#
SideModeTag = {
    SideMode.Left: 'cutlass::SideMode::kLeft',
    SideMode.Right: 'cutlass::SideMode::kRight'
}

#
ShortSideModeNames = {
    SideMode.Left: 'ls',
    SideMode.Right: 'rs'
}

###################################################################################################

#
class FillMode(enum.Enum):
    Lower = enum_auto()
    Upper = enum_auto()


#
FillModeTag = {
    FillMode.Lower: 'cutlass::FillMode::kLower',
    FillMode.Upper: 'cutlass::FillMode::kUpper'
}

#
ShortFillModeNames = {
    FillMode.Lower: 'l',
    FillMode.Upper: 'u'
}

###################################################################################################

#
class DiagType(enum.Enum):
    NonUnit = enum_auto()
    Unit = enum_auto()


#
DiagTypeTag = {
    DiagType.NonUnit: 'cutlass::DiagType::kNonUnit',
    DiagType.Unit: 'cutlass::DiagType::kUnit'
}

#
ShortDiagTypeNames = {
    DiagType.NonUnit: 'nu',
    DiagType.Unit: 'un'
}

###################################################################################################

OpcodeClassNames = {
    cutlass.OpClass.Simt: 'simt',
    cutlass.OpClass.TensorOp: 'tensorop',
    cutlass.OpClass.WmmaTensorOp: 'wmma_tensorop',
    cutlass.OpClass.SparseTensorOp: 'sptensorop'
}

OpcodeClassTag = {
    cutlass.OpClass.Simt: 'cutlass::arch::OpClassSimt',
    cutlass.OpClass.TensorOp: 'cutlass::arch::OpClassTensorOp',
    cutlass.OpClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp',
    cutlass.OpClass.SparseTensorOp: 'cutlass::arch::OpClassSparseTensorOp'
}

###################################################################################################

#
class OperationKind(enum.Enum):
    Gemm = enum_auto()
    RankK = enum_auto()
    Rank2K = enum_auto()
    Trmm = enum_auto()
    Symm = enum_auto()
    Conv2d = enum_auto()
    Conv3d = enum_auto()


#
OperationKindNames = {
    OperationKind.Gemm: 'gemm',
    OperationKind.RankK: 'rank_k',
    OperationKind.Rank2K: 'rank_2k',
    OperationKind.Trmm: 'trmm',
    OperationKind.Symm: 'symm',
    OperationKind.Conv2d: 'conv2d',
    OperationKind.Conv3d: 'conv3d'
}

#
ArchitectureNames = {
    50: 'maxwell',
    60: 'pascal',
    61: 'pascal',
    70: 'volta',
    75: 'turing',
    80: 'ampere',
    90: 'hopper'
}

#
SharedMemPerCC = {
    70: 96 << 10,   # 96KB of SMEM
    72: 96 << 10,   # 96KB of SMEM
    75: 64 << 10,   # 64KB of SMEM
    80: 160 << 10,  # 164KB of SMEM - 4KB reserved for the driver
    86: 100 << 10,  # 100KB of SMEM
    87: 160 << 10,  # 164KB of SMEM - 4KB reserved for the driver
    89: 100 << 10,  # 100KB of SMEM
    90: 227 << 10,  # 228KB of SMEM - 1KB reserved for the driver
}
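# Illustrative sketch (not part of the original file): a generator can consult
# SharedMemPerCC to reject tile shapes whose SharedStorage exceeds the target
# compute capability. The 48 KB fallback for unlisted CCs is an assumption
# (the static shared-memory limit), not a value taken from the table above.
def _smem_fits(smem_bytes_needed, cc):
    return smem_bytes_needed <= SharedMemPerCC.get(cc, 48 << 10)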
|
||||
###################################################################################################
|
||||
|
||||
class GemmKind(enum.Enum):
|
||||
Gemm = enum_auto()
|
||||
Sparse = enum_auto()
|
||||
Universal = enum_auto()
|
||||
PlanarComplex = enum_auto()
|
||||
PlanarComplexArray = enum_auto()
|
||||
Grouped = enum_auto()
|
||||
|
||||
|
||||
#
|
||||
GemmKindNames = {
|
||||
GemmKind.Gemm: "gemm",
|
||||
GemmKind.Sparse: "spgemm",
|
||||
GemmKind.Universal: "gemm",
|
||||
GemmKind.PlanarComplex: "gemm_planar_complex",
|
||||
GemmKind.PlanarComplexArray: "gemm_planar_complex_array",
|
||||
GemmKind.Grouped: "gemm_grouped"
|
||||
}
|
||||
|
||||
#
|
||||
|
||||
|
||||
class RankKKind(enum.Enum):
|
||||
Universal = enum_auto()
|
||||
|
||||
|
||||
#
|
||||
RankKKindNames = {
|
||||
RankKKind.Universal: "rank_k"
|
||||
}
|
||||
|
||||
#
|
||||
|
||||
|
||||
class TrmmKind(enum.Enum):
|
||||
Universal = enum_auto()
|
||||
|
||||
|
||||
#
|
||||
TrmmKindNames = {
|
||||
TrmmKind.Universal: "trmm"
|
||||
}
|
||||
|
||||
#
|
||||
|
||||
|
||||
class SymmKind(enum.Enum):
|
||||
Universal = enum_auto()
|
||||
|
||||
|
||||
#
|
||||
SymmKindNames = {
|
||||
SymmKind.Universal: "symm"
|
||||
}
|
||||
|
||||
#
|
||||
|
||||
|
||||
class SwizzlingFunctor(enum.Enum):
    Identity1 = enum_auto()
    Identity2 = enum_auto()
    Identity4 = enum_auto()
    Identity8 = enum_auto()
    Horizontal = enum_auto()
    BatchedIdentity1 = enum_auto()
    StridedDgradIdentity1 = enum_auto()
    StridedDgradIdentity4 = enum_auto()
    StridedDgradHorizontal = enum_auto()


#
SwizzlingFunctorTag = {
    SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>',
    SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>',
    SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>',
    SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>',
    SwizzlingFunctor.Horizontal: 'cutlass::gemm::threadblock::GemmHorizontalThreadblockSwizzle',
    SwizzlingFunctor.BatchedIdentity1: 'cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle',
    SwizzlingFunctor.StridedDgradIdentity1: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<1>',
    SwizzlingFunctor.StridedDgradIdentity4: 'cutlass::conv::threadblock::StridedDgradIdentityThreadblockSwizzle<4>',
    SwizzlingFunctor.StridedDgradHorizontal: 'cutlass::conv::threadblock::StridedDgradHorizontalThreadblockSwizzle',
}

#
class SchedulerMode(enum.Enum):
    Device = enum_auto()
    Host = enum_auto()


#
SchedulerModeTag = {
    SchedulerMode.Device: 'cutlass::gemm::kernel::GroupScheduleMode::kDeviceOnly',
    SchedulerMode.Host: 'cutlass::gemm::kernel::GroupScheduleMode::kHostPrecompute'
}

#
ShortSchedulerModeNames = {
    SchedulerMode.Device: 'Device',
    SchedulerMode.Host: 'Host'
}

###################################################################################################

#
ConvKindTag = {
    cutlass.conv.Operator.fprop: 'cutlass::conv::Operator::kFprop',
    cutlass.conv.Operator.dgrad: 'cutlass::conv::Operator::kDgrad',
    cutlass.conv.Operator.wgrad: 'cutlass::conv::Operator::kWgrad'
}

ConvKindNames = {
    cutlass.conv.Operator.fprop: 'fprop',
    cutlass.conv.Operator.dgrad: 'dgrad',
    cutlass.conv.Operator.wgrad: 'wgrad',
}

#
IteratorAlgorithmTag = {
    cutlass.conv.IteratorAlgorithm.analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic',
    cutlass.conv.IteratorAlgorithm.optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized',
    cutlass.conv.IteratorAlgorithm.fixed_channels: 'cutlass::conv::IteratorAlgorithm::kFixedChannels',
    cutlass.conv.IteratorAlgorithm.few_channels: 'cutlass::conv::IteratorAlgorithm::kFewChannels'
}

IteratorAlgorithmNames = {
    cutlass.conv.IteratorAlgorithm.analytic: 'analytic',
    cutlass.conv.IteratorAlgorithm.optimized: 'optimized',
    cutlass.conv.IteratorAlgorithm.fixed_channels: 'fixed_channels',
    cutlass.conv.IteratorAlgorithm.few_channels: 'few_channels'
}

#
class StrideSupport(enum.Enum):
    Strided = enum_auto()
    Unity = enum_auto()


#
StrideSupportTag = {
    StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided',
    StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity',
}

StrideSupportNames = {
    StrideSupport.Strided: '',
    StrideSupport.Unity: 'unity_stride',
}


class ConvMode(enum.Enum):
    CrossCorrelation = enum_auto()
    Convolution = enum_auto()


#
ConvModeTag = {
    ConvMode.CrossCorrelation: 'cutlass::conv::Mode::kCrossCorrelation',
    ConvMode.Convolution: 'cutlass::conv::Mode::kConvolution'
}

###################################################################################################

#
class MathInstruction:
    """
    Description of the lowest-level matrix-multiply-accumulate operation to be used in a kernel
    """
    def __init__(self, instruction_shape, element_a, element_b, element_accumulator,
                 opcode_class=cutlass.OpClass.Simt, math_operation=MathOperation.multiply_add):
        """
        :param instruction_shape: size of the [M, N, K] dimensions of the instruction
        :type instruction_shape: list or tuple
        :param element_a: data type of operand A
        :param element_b: data type of operand B
        :param element_accumulator: data type used in accumulation
        :param opcode_class: higher-level class of the instruction (e.g., SIMT or Tensor Core)
        :type opcode_class: cutlass.OpClass
        :param math_operation: the type of low-level operation to be performed (e.g., multiply accumulate)
        :type math_operation: MathOperation
        """
        self.instruction_shape = instruction_shape
        self.element_a = element_a
        self.element_b = element_b
        self.element_accumulator = element_accumulator
        self.opcode_class = opcode_class
        self.math_operation = math_operation


#
class TileDescription:
    """
    Description of a tile of computation to be performed in the kernel, encompassing
    threadblock, cluster, and warp shapes, stage count, and math instruction specification
    """
    def __init__(self, threadblock_shape, stages, warp_count, math_instruction,
                 cluster_shape=[1, 1, 1], persistent=False):
        """
        :param threadblock_shape: shape of a threadblock tile
        :type threadblock_shape: list or tuple
        :param stages: number of pipeline stages in the operation. For SM90 kernels, this can be set to `None` and the maximum
                       number of stages that can be supported for an operation on a given architecture will be computed at a later time
        :type stages: int or None
        :param warp_count: number of warps in each [M, N, K] dimension of a threadblock tile
        :type warp_count: list, tuple, or None
        :param math_instruction: specification of the instruction type and shape to be performed and the types of its operands
        :type math_instruction: MathInstruction
        :param cluster_shape: number of threadblocks in the [X, Y, Z] dimensions of a threadblock cluster
        :param persistent: whether the kernel uses persistent warp-specialized threadblocks (only available for SM90+)
        :type persistent: bool
        """
        self.threadblock_shape = threadblock_shape
        self.cluster_shape = cluster_shape
        self.persistent: bool = persistent
        self.stages: int = stages

        self.math_instruction = math_instruction

        # Number of warps along x, y, z directions
        self.warp_count = warp_count

    @property
    def num_threads(self):
        """
        Returns the number of threads in the threadblock

        :return: number of threads in the threadblock
        :rtype: int or None (if warp count is None)
        """
        if self.warp_count is not None:
            threads = 32
            for cnt in self.warp_count:
                threads *= cnt
            return threads
        return None

    def procedural_name(self):
        """
        Returns a name identifying the tile description

        :return: name identifying the tile description
        :rtype: str
        """
        emit_stages = 0 if self.stages is None else self.stages
        name = "%dx%dx%d_%dx%d_%dx%d" % (
            self.cluster_shape[0], self.cluster_shape[1], self.cluster_shape[2],
            self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], emit_stages)

        if self.persistent:
            name += '_persistent'
        return name
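
# A minimal construction sketch (illustrative values; the fp16 Tensor Core
# instruction shape below is an assumption, not taken from this file):
#   inst = MathInstruction([16, 8, 16], cutlass.float16, cutlass.float16,
#                          cutlass.float32, cutlass.OpClass.TensorOp)
#   td = TileDescription([128, 128, 32], stages=3, warp_count=[2, 2, 1],
#                        math_instruction=inst)
#   td.num_threads          # 2 * 2 * 1 * 32 = 128
#   td.procedural_name()    # '1x1x1_128x128_32x3'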


#
class TensorDescription:
    def __init__(self, element, layout, alignment=1, complex_transform=cutlass.complex_transform.none):
        self.element = element
        self.layout = layout
        self.alignment = min(128 // DataTypeSize[self.element], alignment)
        self.complex_transform = complex_transform


#
class SymmetricTensorDescription:
    def __init__(self, element, layout, fill_mode, alignment=1,
                 complex_transform=cutlass.complex_transform.none, side_mode=SideMode.Left):
        self.element = element
        self.layout = layout
        self.fill_mode = fill_mode
        self.alignment = alignment
        self.complex_transform = complex_transform
        self.side_mode = side_mode


#
class TriangularTensorDescription:
    def __init__(self, element, layout, side_mode, fill_mode, diag_type, alignment=1,
                 complex_transform=cutlass.complex_transform.none):
        self.element = element
        self.layout = layout
        self.side_mode = side_mode
        self.fill_mode = fill_mode
        self.diag_type = diag_type
        self.alignment = alignment
        self.complex_transform = complex_transform

###################################################################################################

#
def CalculateSmemUsagePerStage(operation):
    """
    Returns the amount of shared memory in bytes consumed in a single stage of a kernel.

    :param operation: operation for which the shared memory usage should be computed. If stages
                      are set via the `operation.tile_description.stages` parameter, that setting
                      is ignored in the present calculation
    :type operation: pycutlass.Operation

    :return: number of bytes of shared memory consumed by a single stage
    :rtype: int
    """
    m, n, k = operation.tile_description.threadblock_shape

    if operation.operation_kind == OperationKind.Gemm:
        stage_barrier_bytes = 32
        return (DataTypeSize[operation.A.element] * m * k // 8) + \
               (DataTypeSize[operation.B.element] * k * n // 8) + stage_barrier_bytes
    else:
        raise Exception('Unsupported operation kind {}.'.format(operation.operation_kind))
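
# Worked example (illustrative, assuming fp16 A and B operands): for a
# 128x128x32 threadblock tile, one stage holds a 128x32 A tile and a 32x128
# B tile plus the 32-byte stage barrier:
#   A tile: 16 * 128 * 32 / 8 = 8192 bytes
#   B tile: 16 * 32 * 128 / 8 = 8192 bytes
#   per-stage total = 8192 + 8192 + 32 = 16416 bytes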


#
def CalculateSmemUsage(operation):
    """
    Returns the amount of shared memory in bytes consumed by a kernel.

    :param operation: operation for which the shared memory usage should be computed. The stage
                      count set via the `operation.tile_description.stages` parameter is used here
    :type operation: pycutlass.Operation

    :return: number of bytes of shared memory consumed by the operation
    :rtype: int
    """
    return operation.tile_description.stages * CalculateSmemUsagePerStage(operation)
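
# Continuing the worked example above: with 3 stages the kernel consumes
# 3 * 16416 = 49248 bytes of shared memory, well within every per-CC budget
# listed in SharedMemPerCC.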


class ApiVersion(enum.Enum):
    """
    Differentiates between CUTLASS 2.x and 3.x API versions
    """
    v2x = enum_auto()
    v3x = enum_auto()


def api_version(arch, opclass, datatype):
    """
    Returns whether the architecture, opcode class, and data type in question require using the
    CUTLASS 2.x or 3.x API for code emission.

    :param arch: compute capability of device on which to run
    :type arch: int
    :param opclass: class of the operation being performed
    :type opclass: cutlass.OpClass
    :param datatype: data type to be used in the operation (assumes that ElementA and ElementB are the same)

    :return: API version to be used in code emission
    :rtype: ApiVersion
    """
    if arch >= 90 and opclass == cutlass.OpClass.TensorOp and (datatype != cutlass.float64):
        return ApiVersion.v3x
    else:
        return ApiVersion.v2x
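
# Sketch of the dispatch behavior (follows directly from the predicate above):
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float16)  # ApiVersion.v3x
#   api_version(90, cutlass.OpClass.TensorOp, cutlass.float64)  # ApiVersion.v2x
#   api_version(80, cutlass.OpClass.TensorOp, cutlass.float16)  # ApiVersion.v2x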

###################################################################################################
@ -1,74 +0,0 @@
import rmm
import numpy as np


class PoolMemoryManager:
    def __init__(self, init_pool_size: int, max_pool_size: int) -> None:
        self.pool = rmm.mr.PoolMemoryResource(
            rmm.mr.CudaMemoryResource(),
            initial_pool_size=init_pool_size,
            maximum_pool_size=max_pool_size
        )
        self.mr = rmm.mr.TrackingResourceAdaptor(self.pool)
        rmm.mr.set_current_device_resource(self.mr)

    def get_allocated_size(self):
        return self.mr.get_allocated_bytes()

    def pool_size(self):
        return self.pool.pool_size()


def todevice(host_data, dtype=np.float32):
    """
    Copy the host_data to device memory
    """
    if isinstance(host_data, list):
        return rmm.DeviceBuffer.to_device(np.array(host_data, dtype=dtype).tobytes())
    elif isinstance(host_data, np.ndarray):
        return rmm.DeviceBuffer.to_device(host_data.tobytes())
    else:
        # Fail loudly instead of silently returning None for unsupported inputs
        raise TypeError('expected a list or numpy.ndarray, got {}'.format(type(host_data)))


def device_mem_alloc(size):
    return rmm.DeviceBuffer(size=size)


def align_size(size, alignment=256):
    return ((size + alignment - 1) // alignment) * alignment
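
# e.g. align_size(1000) == 1024 and align_size(512) == 512 with the default
# 256-byte alignment.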


def get_allocated_size():
    device_resource = rmm.mr.get_current_device_resource()
    return device_resource.get_allocated_bytes()
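
# Usage sketch (assumes an initialized CUDA context; sizes are illustrative):
#   manager = PoolMemoryManager(init_pool_size=2 ** 30, max_pool_size=2 ** 32)
#   buf = todevice([1.0, 2.0, 3.0])                  # 12-byte DeviceBuffer
#   padded = device_mem_alloc(align_size(buf.size))  # rounded up to 256 bytes
#   print(get_allocated_size())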
@ -1,153 +0,0 @@
import ctypes
from cuda import cuda
from pycutlass.utils.device import device_cc

from cuda import __version__ as __cuda_version__

# Cluster launch requires an SM90+ device and CUDA 11.8 or newer
_version_splits = [int(x) for x in __cuda_version__.split('.')]
supports_cluster_launch = device_cc() >= 90 and (
    _version_splits[0] > 11 or (_version_splits[0] == 11 and _version_splits[1] >= 8))

################################################################################
#
# Launch configuration
#
################################################################################


class LaunchConfiguration:
    def __init__(self, grid=[1, 1, 1], block=[1, 1, 1], smem=0):
        self.grid = grid
        self.block = block
        self.shared_memory_capacity = smem


################################################################################
#
# Base class for an executable operation
#
################################################################################

class ExecutableOperation:
    '''
    Base class for an operation that can be compiled and launched as a CUDA kernel
    '''

    def __init__(self, operation):
        self.operation = operation
        self.module = None
        self.kernel = None

    #
    def name(self):
        return self.operation.procedural_name()

    #
    def emit(self):
        return ''

    #
    def can_implement(self, configuration, arguments):
        raise NotImplementedError()

    #
    def get_host_workspace_size(self, arguments):
        raise NotImplementedError()

    #
    def get_device_workspace_size(self, arguments):
        raise NotImplementedError()

    #
    def plan(self, arguments):
        raise NotImplementedError()

    #
    def initialize(self, host_workspace, device_workspace, launch_config, arguments, stream=cuda.CUstream(0)):
        raise NotImplementedError()

    #
    def run_with_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
        if hasattr(self.operation, 'tile_description') and hasattr(self.operation.tile_description, 'cluster_shape'):
            attr = cuda.CUlaunchAttribute()
            attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.operation.tile_description.cluster_shape
            attr.id = cuda.CUstreamAttrID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
            attrs = [attr]

            # Allow for non-portable cluster sizes
            err, = cuda.cuFuncSetAttribute(
                self.kernel, cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1)
            if err != cuda.CUresult.CUDA_SUCCESS:
                return err
        else:
            attrs = []

        config = cuda.CUlaunchConfig()
        config.gridDimX, config.gridDimY, config.gridDimZ = launch_config.grid
        config.blockDimX, config.blockDimY, config.blockDimZ = launch_config.block
        config.sharedMemBytes = launch_config.shared_memory_capacity
        config.hStream = stream
        config.attrs = attrs
        config.numAttrs = len(attrs)

        err, = cuda.cuLaunchKernelEx(config, f=self.kernel, kernelParams=kernel_params, extra=0)
        return err

    #
    def run_without_clusters(self, launch_config, kernel_params, stream=cuda.CUstream(0)):
        err, = cuda.cuLaunchKernel(
            self.kernel,
            launch_config.grid[0], launch_config.grid[1], launch_config.grid[2],
            launch_config.block[0], launch_config.block[1], launch_config.block[2],
            launch_config.shared_memory_capacity,
            stream,
            kernel_params,
            0)

        return err

    #
    def run(self, host_workspace, device_workspace, launch_config, stream=cuda.CUstream(0)):
        # Pack the host-side params buffer into a single kernel argument
        cArg = (ctypes.c_char * len(host_workspace)).from_buffer(host_workspace)
        packed = (ctypes.c_void_p * 1)()
        packed[0] = ctypes.addressof(cArg)

        if supports_cluster_launch:
            return self.run_with_clusters(launch_config, packed, stream)
        else:
            return self.run_without_clusters(launch_config, packed, stream)
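
# Launch dispatch sketch: given an ExecutableOperation subclass `op` with a
# compiled self.kernel, a host params buffer, and a launch configuration
# (all assumed set up elsewhere), a call reduces to:
#   err = op.run(host_workspace, device_workspace=None, launch_config=cfg)
# which packs the params buffer and picks cuLaunchKernelEx (with the cluster
# dimension attribute) or cuLaunchKernel based on supports_cluster_launch.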

@ -1,614 +0,0 @@
from typing import Generic, TypeVar
from treelib import Tree
import numpy as np

from pycutlass import *
import pycutlass

import ast
import textwrap
import inspect

################################################################################
# Type annotation for input arguments
################################################################################

Ttype = TypeVar("Ttype")
Dtype = TypeVar("Dtype")


class NDArray(np.ndarray, Generic[Ttype, Dtype]):
    pass


################################################################################
# Operations
################################################################################

# Maps Python AST operator types to the suffix of the corresponding pycutlass
# vector operation (e.g. ast.Add -> VectorAdd)
operators = {
    ast.Add: "Add",
    ast.Div: "Div",
    ast.Eq: "Equal",
    ast.Mult: "Mult"
}

################################################################################
# AST Node abstractions
################################################################################
class UnaryNode:
    cnt = 0

    # Concept: this is created by the BinOp Node in python ast
    def __init__(self,
                 element_accumulator, element_compute, elements_per_access,
                 node, args) -> None:
        if isinstance(node, BinOpNode):
            self.op = node.op
        elif isinstance(node, ast.Call):
            if isinstance(node.func, ast.Name):
                self.op = node.func.id
            elif isinstance(node.func, ast.Attribute):
                self.op = node.func.value.id
            else:
                raise TypeError
        else:
            raise TypeError
        self.tag = "Unary" + self.op + str(UnaryNode.cnt)
        self.id = self.op + str(UnaryNode.cnt)
        self.args = args
        UnaryNode.cnt += 1

        self.type = "tensor"

        self.epilogue_op = getattr(pycutlass, self.op)(element_compute)

        # data types
        self.element_accumulator = element_accumulator
        self.element_compute = element_compute
        self.elements_per_access = elements_per_access

    def get_epilogue_node(self, visitors):
        self.epilogue_node = UnaryOp(
            self.element_accumulator, self.element_compute,
            self.elements_per_access, *visitors, self.epilogue_op)

    def get_argument(self, visitor_args, kwargs):
        epilogue_ops = []
        for arg in self.args:
            try:
                epilogue_ops.append(kwargs[arg])
            except (KeyError, TypeError):
                epilogue_ops.append(arg)  # direct arguments like constants
        self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(*epilogue_ops), *visitor_args)

class BinOpNode:
    cnt = 0

    # Concept: this is created by the BinOp Node in python ast
    def __init__(self,
                 element_accumulator, element_compute, elements_per_access,
                 node) -> None:
        self.op = operators[type(node.op)]
        self.tag = "Binary" + self.op + str(BinOpNode.cnt)
        self.id = self.op + str(BinOpNode.cnt)
        self.args = None
        BinOpNode.cnt += 1

        self.type = "tensor"

        self.epilogue_op = getattr(pycutlass, "Vector" + self.op)(element_compute)

        # data types
        self.element_accumulator = element_accumulator
        self.element_compute = element_compute
        self.elements_per_access = elements_per_access

    def get_epilogue_node(self, visitors):
        self.epilogue_node = BinaryOp(
            self.element_accumulator, self.element_compute,
            self.elements_per_access, *visitors, self.epilogue_op)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(self.epilogue_op.argument_type(self.args), *visitor_args)


class NameNode:
    # Concept: this is created by the Name Node in python ast
    def __init__(self, node) -> None:
        # ast.Name stores the identifier in .id; ast.Assign stores its
        # targets in .targets
        if hasattr(node, 'id'):
            self.id = node.id
        else:
            self.id = node.targets[0].id
        self.tag = self.id


class ScalarInputNode(NameNode):
    # Concept: scalar
    def __init__(self, node) -> None:
        super().__init__(node)
        self.tag = "Scalar:" + self.tag
        self.type = "scalar"

class AccumulatorNode(NameNode):
    # Concept: VisitorOpAccumulator
    def __init__(self,
                 element_accumulator, elements_per_access, node) -> None:
        super().__init__(node)
        self.tag = "Accum:" + self.tag
        self.type = "tensor"

        self.element_accumulator = element_accumulator
        self.elements_per_access = elements_per_access

    def get_epilogue_node(self, visitors):
        self.epilogue_node = AccumulatorOp(
            self.element_accumulator, self.elements_per_access)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type()


class TensorInputNode(NameNode):
    # Concept: VisitorOpTensorInput
    def __init__(self, element_accumulator, node) -> None:
        super().__init__(node)
        self.tag = "TensorInput:" + self.tag
        self.type = "tensor"
        self.element_accumulator = element_accumulator

    def get_epilogue_node(self, *args):
        self.epilogue_node = TensorInputOp(self.element_accumulator)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(
            kwargs[self.id + "_ptr"], kwargs["problem_size"][1],
            kwargs["problem_size"][0] * kwargs["problem_size"][1])


class RowBroadcastNode(NameNode):
    # Concept: VisitorOpRowBroadcast
    def __init__(self, element_accumulator, element_fragment, node) -> None:
        super().__init__(node)
        self.tag = "RowBroadcast:" + self.tag
        self.type = "tensor"
        self.element_accumulator = element_accumulator
        self.element_fragment = element_fragment

    def get_epilogue_node(self, *args):
        self.epilogue_node = RowBroadcastOp(
            self.element_accumulator, self.element_fragment)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][1])


class ColumnBroadcastNode(NameNode):
    # Concept: VisitorOpColumnBroadcast
    def __init__(self, element_accumulator, element_fragment, node) -> None:
        super().__init__(node)
        self.tag = "ColumnBroadcast:" + self.tag
        self.type = "tensor"
        self.element_accumulator = element_accumulator
        self.element_fragment = element_fragment

    def get_epilogue_node(self, *args):
        self.epilogue_node = ColumnBroadcastOp(
            self.element_accumulator, self.element_fragment)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(kwargs[self.id + "_ptr"], kwargs["problem_size"][0])


class TensorOutputNode(NameNode):
    # Concept: VisitorOpTensorOutput
    def __init__(self, element_accumulator, node) -> None:
        super().__init__(node)
        self.tag = "TensorOutput:" + self.tag
        self.type = "tensor"
        self.element_accumulator = element_accumulator

    def get_epilogue_node(self, visitors):
        self.epilogue_node = TensorOutputOp(self.element_accumulator, *visitors)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(
            kwargs[self.id + "_ptr"], kwargs["problem_size"][1], *visitor_args,
            kwargs["problem_size"][0] * kwargs["problem_size"][1])

class RowReductionNode:
    # Concept: RowReductionOp
    def __init__(self, element_accumulator, element_reduction,
                 element_reduction_accumulator, id, factor) -> None:
        self.id = id
        self.tag = "RowReduction:" + self.id
        self.type = "tensor"
        self.element_accumulator = element_accumulator
        self.element_reduction = element_reduction
        self.element_reduction_accumulator = element_reduction_accumulator
        self.factor = factor

    def get_epilogue_node(self, visitors):
        self.epilogue_node = RowReductionOp(
            self.element_accumulator, self.element_reduction,
            self.element_reduction_accumulator, *visitors)

    def get_batch_stride(self, problem_size):
        return problem_size[0] * ((problem_size[1] + self.factor - 1) // self.factor)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(
            kwargs[self.id + "_ptr"], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))


class ColumnReductionNode:
    # Concept: ColumnReductionOp
    def __init__(self, element_accumulator, element_reduction,
                 element_reduction_accumulator, id, factor) -> None:
        self.id = id
        self.tag = "ColumnReduction:" + self.id
        self.type = "tensor"
        self.element_accumulator = element_accumulator
        self.element_reduction = element_reduction
        self.element_reduction_accumulator = element_reduction_accumulator
        self.factor = factor

    def get_epilogue_node(self, visitors):
        self.epilogue_node = ColumnReductionOp(
            self.element_accumulator, self.element_reduction,
            self.element_reduction_accumulator, *visitors)

    def get_batch_stride(self, problem_size):
        return problem_size[1] * ((problem_size[0] + self.factor - 1) // self.factor)

    def get_argument(self, visitor_args, kwargs):
        self.argument = self.epilogue_node.argument_type(
            kwargs[self.id + '_ptr'], *visitor_args, self.get_batch_stride(kwargs["problem_size"]))

################################################################################
# Epilogue parser function
################################################################################
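
# A sketch of the kind of epilogue definition this parser walks. The argument
# annotations name each input's kind ('tensor', 'row', 'column', 'scalar');
# 'accum' is bound to the accumulator. Names here are illustrative only:
#
#   class EpilogueExample:
#       def __call__(self, accum, alpha: 'scalar', C: 'tensor', beta: 'scalar'):
#           D = alpha * accum + beta * C
#           return D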
class EpilogueAST(ast.NodeVisitor):
    def __init__(self, epilogue,
                 tile_description,
                 element_accumulator, elements_per_access,
                 element_compute, element_output) -> None:
        self.tile_description = tile_description
        self.element_accumulator = element_accumulator
        self.elements_per_access = elements_per_access
        self.element_compute = element_compute
        self.element_output = element_output
        self.epilogue = epilogue

        self.source = textwrap.dedent(inspect.getsource(epilogue.__call__))
        self.ast_tree = ast.parse(self.source)
        self.epilogue_tree = Tree()

        # print(ast.dump(self.ast_tree, indent=4))  # For debug purposes

        # input arguments
        self.input_args = {}
        # return nodes
        self.returns = []
        # reduction source nodes
        self.reduction_source = {}

        # stack used to keep the parent node id
        self.stack = []

        # visit the AST
        self.visit(self.ast_tree)

    # visit the name node
    def visit_Name(self, node):
        # append the return ids into self.returns
        if self.stack[-1] == "return":
            self.returns.append(node.id)
        else:
            # accum is produced from the accumulator node
            if node.id == "accum":
                name_node = AccumulatorNode(
                    self.element_accumulator, self.elements_per_access, node)
            else:
                # for input nodes
                if node.id in self.input_args.keys():
                    arg_type = self.input_args[node.id][0]
                    if arg_type == "tensor":
                        name_node = TensorInputNode(self.element_accumulator, node)
                    elif arg_type == "row":
                        name_node = RowBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "column":
                        name_node = ColumnBroadcastNode(self.element_accumulator, self.element_compute, node)
                    elif arg_type == "scalar":
                        name_node = ScalarInputNode(node)
                    else:
                        raise ValueError(arg_type)
                # for output nodes
                else:
                    name_node = TensorOutputNode(self.element_accumulator, node)
            self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node, parent=self.stack[-1])

    def visit_Assign(self, node):
        pre_assign_node = self.epilogue_tree.get_node(node.targets[0].id)
        if pre_assign_node is None:
            # The assign is to a root node
            # skip the reduction nodes
            if isinstance(node.value, ast.Call):
                if isinstance(node.value.func, ast.Name):
                    func_type = node.value.func.id
                elif isinstance(node.value.func, ast.Attribute):
                    func_type = node.value.func.value.id
                else:
                    raise TypeError
                if func_type == 'reduction_op':
                    self.reduction_source[node.value.args[0].id] = [
                        node.value.args[1].value, node.value.args[2].value, node.targets[0].id]
                    return
            name_node = TensorOutputNode(self.element_accumulator, node)
            self.epilogue_tree.create_node(name_node.tag, name_node.id, data=name_node)
            self.stack.append(name_node.id)
        else:
            if node.targets[0].id in self.returns or node.targets[0].id in self.reduction_source.keys():
                self.stack.append(node.targets[0].id)
            else:
                self.stack.append(pre_assign_node.predecessor(self.epilogue_tree.identifier))
                self.epilogue_tree.remove_node(node.targets[0].id)

        # visit the right-hand side of the assignment
        self.visit(node.value)
        self.stack.pop()

    def visit_Call(self, node):
        if isinstance(node.func, ast.Name):
            func_type = node.func.id
        elif isinstance(node.func, ast.Attribute):
            func_type = node.func.value.id
        else:
            raise TypeError
        if func_type == "reduction_op":
            self.visit(node.args[0])
        else:
            arg_list = []
            for idx, arg in enumerate(node.args):
                if idx == 0:
                    continue
                if isinstance(arg, ast.Constant):
                    arg_list.append(arg.value)
                elif isinstance(arg, ast.Name):
                    arg_list.append(arg.id)
                else:
                    raise TypeError

            unary_node = UnaryNode(self.element_accumulator, self.element_compute,
                                   self.elements_per_access, node, arg_list)
            self.epilogue_tree.create_node(unary_node.tag, unary_node.id, parent=self.stack[-1], data=unary_node)
            self.stack.append(unary_node.id)
            self.visit(node.args[0])
            self.stack.pop()

    def visit_BinOp(self, node):
        binop = BinOpNode(self.element_accumulator, self.element_compute,
                          self.elements_per_access, node)
        self.epilogue_tree.create_node(binop.tag, binop.id, data=binop, parent=self.stack[-1])
        self.stack.append(binop.id)
        self.visit(node.left)
        self.visit(node.right)
        self.stack.pop()

    def visit_Return(self, node):
        self.stack.append("return")
        self.visit(node.value)
        self.stack.pop()

    # A function definition
    def visit_FunctionDef(self, node: ast.FunctionDef):
        # visit args
        for arg in node.args.args:
            if arg.arg == "self":
                continue
            if isinstance(arg.annotation, ast.Constant):
                self.input_args[arg.arg] = [arg.annotation.value, ]
        # visit the assigns in reverse order
        for idx in range(len(node.body)):
            self.visit(node.body[-1 - idx])

    #
    # Tree optimization passes
    #
    # pass 1: lower Binary to Unary
    def pass_binary_2_unary(self, tree, nid):
        node = tree.get_node(nid)
        if isinstance(node.data, BinOpNode):
            lhs_node = tree.get_node(node.successors(tree.identifier)[0])
            left_type = lhs_node.data.type
            rhs_node = tree.get_node(node.successors(tree.identifier)[1])
            right_type = rhs_node.data.type

            if left_type == "scalar" and right_type == "tensor":
                node.data = UnaryNode(
                    self.element_accumulator, self.element_compute,
                    self.elements_per_access,
                    node.data, [lhs_node.data.id, ])
                node.tag = node.data.tag
                tree.remove_node(lhs_node.data.id)
                self.pass_binary_2_unary(tree, rhs_node.data.id)

            elif left_type == "tensor" and right_type == "scalar":
                node.data = UnaryNode(
                    self.element_accumulator, self.element_compute,
                    self.elements_per_access,
                    node.data, [rhs_node.data.id, ])
                node.tag = node.data.tag
                tree.remove_node(rhs_node.data.id)
                self.pass_binary_2_unary(tree, lhs_node.data.id)

            else:
                self.pass_binary_2_unary(tree, lhs_node.data.id)
                self.pass_binary_2_unary(tree, rhs_node.data.id)
        else:
            for child in node.successors(tree.identifier):
                self.pass_binary_2_unary(tree, child)
    # pass 2: inject reduction nodes
    def pass_inject_reduction(self, tree, nid):
        node = tree.get_node(nid)
        if isinstance(node.data, TensorOutputNode):
            if node.data.id in self.reduction_source.keys():
                direction = self.reduction_source[node.data.id][0]
                target = self.reduction_source[node.data.id][-1]
                if direction == 'row':
                    reduction_node = RowReductionNode(
                        self.element_accumulator, self.element_output,
                        self.element_accumulator, target, self.tile_description.threadblock_shape[1])
                elif direction == "column":
                    reduction_node = ColumnReductionNode(
                        self.element_accumulator, self.element_output,
                        self.element_accumulator, target, self.tile_description.threadblock_shape[0])
                else:
                    raise ValueError(direction)
                child_nid = node.successors(tree.identifier)[0]
                # if this output node is injected only for reduction,
                # replace it with the reduction node
                if node.data.id not in self.returns:
                    node.data = reduction_node
                    node.tag = reduction_node.tag
                    self.pass_inject_reduction(tree, child_nid)
                # if this output node is also a tensor output, inject the reduction as its child
                else:
                    tree.create_node(reduction_node.tag, reduction_node.id, data=reduction_node, parent=node.data.id)
                    tree.move_node(child_nid, reduction_node.id)
                    child = tree.get_node(child_nid)
                    for grand_child in child.successors(tree.identifier):
                        self.pass_inject_reduction(tree, grand_child)
            else:
                for child in node.successors(tree.identifier):
                    self.pass_inject_reduction(tree, child)
        else:
            for child in node.successors(tree.identifier):
                self.pass_inject_reduction(tree, child)

    # pass 3: attach the epilogue visitor nodes bottom-up
    def pass_inject_epilogue_op(self, tree, nid):
        node = tree.get_node(nid)
        visitors = []
        for child in node.successors(tree.identifier):
            visitors.append(self.pass_inject_epilogue_op(tree, child))

        node.data.get_epilogue_node(visitors)
        return node.data.epilogue_node

    # gather the ctypes arguments for each node bottom-up
    def get_arguments(self, tree, nid, kwargs):
        node = tree.get_node(nid)
        visitor_args = []
        for child in node.successors(tree.identifier):
            visitor_args.append(self.get_arguments(tree, child, kwargs))

        node.data.get_argument(visitor_args, kwargs)
        return node.data.argument

class EpilogueVisitTree:
    KernelTemplate = """
${visitor}

using ${operation_name}_EpilogueVisitor = cutlass::epilogue::threadblock::EpilogueVisitorGeneric<${visitor_name}>;
"""

    def __init__(self, elementwise_functor, tile_description,
                 element_accumulator, elements_per_access,
                 element_compute, element_output) -> None:
        # data types
        self.tile_description = tile_description
        self.element_accumulator = element_accumulator
        self.elements_per_access = elements_per_access
        self.element_compute = element_compute
        self.element_output = element_output
        self.elementwise_functor = elementwise_functor

    def initialize(self):
        function = EpilogueAST(self, self.tile_description,
                               self.element_accumulator, self.elements_per_access,
                               self.element_compute, self.element_output)

        tree = function.epilogue_tree
        self.tree = tree
        function.pass_binary_2_unary(self.tree, self.tree.root)
        function.pass_inject_reduction(self.tree, self.tree.root)
        function.pass_inject_epilogue_op(self.tree, self.tree.root)

        visitor = self.tree.get_node(self.tree.root).data.epilogue_node
        self.visitor = visitor

        class _Argument(ctypes.Structure):
            _fields_ = [
                ("visitor_arg", visitor.argument_type)
            ]

            def __init__(self, **kwargs) -> None:
                # process input args
                _kwargs = {}
                for input_key in function.input_args.keys():
                    if input_key == "accum":
                        continue
                    if function.input_args[input_key][0] == "scalar":
                        continue
                    # tensor input
                    else:
                        setattr(self, "buffer_tensor_" + input_key, NumpyFrontend.argument(kwargs[input_key], False))
                        setattr(self, input_key + "_ptr", int(getattr(self, "buffer_tensor_" + input_key).ptr))
                        _kwargs[input_key + "_ptr"] = getattr(self, input_key + "_ptr")
                # process the return args
                for ret in function.returns:
                    setattr(self, "buffer_tensor_" + ret, NumpyFrontend.argument(kwargs[ret], True))
                    setattr(self, ret + "_ptr", int(getattr(self, "buffer_tensor_" + ret).ptr))
                    _kwargs[ret + "_ptr"] = getattr(self, ret + "_ptr")
                    setattr(self, "host_tensor_" + ret, kwargs[ret])

                _kwargs.update(kwargs)
                function.get_arguments(tree, tree.root, _kwargs)
                self.visitor_arg = tree.get_node(tree.root).data.argument

            def sync(self, stream_sync=True):
                if stream_sync:
                    err, = cudart.cudaDeviceSynchronize()
                    if err != cudart.cudaError_t.cudaSuccess:
                        raise RuntimeError("CUDA Error %s" % str(err))

                for ret in function.returns:
                    err, = cuda.cuMemcpyDtoH(
                        getattr(self, "host_tensor_" + ret), cuda.CUdeviceptr(getattr(self, ret + "_ptr")),
                        getattr(self, "host_tensor_" + ret).size * getattr(self, "host_tensor_" + ret).itemsize
                    )
                    if err != cuda.CUresult.CUDA_SUCCESS:
                        raise RuntimeError("CUDA Error %s" % str(err))

        self.epilogue_type = _Argument

    def emit(self, operation):
        values = {
            'visitor': self.visitor.emit(operation),
            'operation_name': operation.procedural_name(),
            'visitor_name': self.visitor.instance_name
        }
        return SubstituteTemplate(self.KernelTemplate, values)
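
# Usage sketch: the tree parses the Python source of self.__call__, so this
# class is typically subclassed with an epilogue definition (names and types
# below are illustrative assumptions):
#
#   class D_EpilogueVisitTree(EpilogueVisitTree):
#       def __call__(self, accum, alpha: 'scalar', C: 'tensor', beta: 'scalar'):
#           D = alpha * accum + beta * C
#           return D
#
#   epilogue = D_EpilogueVisitTree(functor, tile_description,
#                                  cutlass.float32, 8,
#                                  cutlass.float32, cutlass.float16)
#   epilogue.initialize()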

@ -1,398 +0,0 @@
from pycutlass import *
from pycutlass.c_types import get_reduction_params
import cutlass
from cuda import cuda
try:
    import torch
    torch_available = True
except ImportError:
    torch_available = False
import numpy as np
from typing import Union
from cuda import cudart


# Forward declaration so that the type hints below can reference
# ReductionOperation before its full definition
class ReductionOperation:
    pass

class ReductionArguments:
    """
    Arguments of reduction
    """

    def __init__(self, operation: ReductionOperation,
                 problem_size: 'list[int]', partitions: int,
                 workspace: cuda.CUdeviceptr,
                 destination: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]',
                 source: 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor]', **kwargs) -> None:

        # tensor_C can be interpreted as the bias with bias=True in keyword args
        if "bias" in kwargs.keys():
            self.bias = kwargs["bias"]
        else:
            # by default, tensor_C is not bias
            self.bias = False

        self.operation = operation
        #: pointer to the workspace
        self.ptr_workspace = workspace

        #: number of split-k partitions
        self.partitions = partitions

        if isinstance(destination, np.ndarray):
            self.host_D = destination
            self.destination_buffer = NumpyFrontend.argument(destination, True)
            self.source_buffer = NumpyFrontend.argument(source, False)
            self.ptr_destination = cuda.CUdeviceptr(self.destination_buffer.ptr)
            self.ptr_source = cuda.CUdeviceptr(self.source_buffer.ptr)
        elif torch_available and isinstance(destination, torch.Tensor):
            self.ptr_destination = TorchFrontend.argument(destination)
            self.ptr_source = TorchFrontend.argument(source)
        elif isinstance(destination, cuda.CUdeviceptr):
            self.ptr_destination = destination
            self.ptr_source = source
        else:
            raise TypeError("unsupported destination type {}".format(type(destination)))

        self.problem_size = MatrixCoord_(
            problem_size[0], problem_size[1]
        )

        self.partition_stride = problem_size[0] * \
            problem_size[1] * DataTypeSize[operation.C.element] // 8

        if "output_op" in kwargs.keys():
            self.output_op = kwargs['output_op']
        else:
            self.output_op = self.operation.epilogue_type(1.0, 0.0)

        # get arguments
        self.get_arguments()

    @staticmethod
    def get_tensor_ref(extent: 'tuple[int]', device_ptr: cuda.CUdeviceptr, layout: cutlass.layout):
        if layout == cutlass.RowMajor:
            return TensorRef2D_(int(device_ptr), extent[1])
        else:
            raise ValueError("unknown layout type")

    def get_arguments(self):
        ref_workspace = ReductionArguments.get_tensor_ref(
            extent=[self.problem_size.row, self.problem_size.column],
            device_ptr=self.ptr_workspace, layout=cutlass.RowMajor)
        if self.bias:
            ref_source = ReductionArguments.get_tensor_ref(
                extent=[0, 0],
                device_ptr=self.ptr_source, layout=cutlass.RowMajor)
        else:
            ref_source = ReductionArguments.get_tensor_ref(
                extent=[self.problem_size.row, self.problem_size.column],
                device_ptr=self.ptr_source, layout=cutlass.RowMajor)

        ref_destination = ReductionArguments.get_tensor_ref(
            extent=[self.problem_size.row, self.problem_size.column],
            device_ptr=self.ptr_destination, layout=cutlass.RowMajor)

        self.c_arguments = self.operation.argument_type(
            self.problem_size, self.partitions,
            self.partition_stride, ref_workspace,
            ref_destination, ref_source,
            self.output_op
        )

        params_ = self.operation.rt_module.get_args(
            ctypes.byref(self.c_arguments))
        self.host_workspace = bytearray(params_.contents)

    def sync(self):
        err, = cudart.cudaDeviceSynchronize()
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("CUDA Error %s" % str(err))

        if hasattr(self, "host_D"):
            err, = cuda.cuMemcpyDtoH(
                self.host_D, self.ptr_destination, self.host_D.size * self.host_D.itemsize)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    def free(self):
        if hasattr(self, "destination_buffer"):
            del self.destination_buffer
        if hasattr(self, "source_buffer"):
            del self.source_buffer

class ReductionRT(ExecutableOperation):
    """
    ReductionRT manages the CUTLASS runtime components for reduction
    """
    KernelTemplate = r'''
extern "C"
__global__ void
${operation_name}(${operation_name}${operation_suffix}::Params params) {

  // Dynamic shared memory base pointer
  extern __shared__ int SharedStorageBase[];

  // Declare pointer to dynamic shared memory.
  ${operation_name}${operation_suffix}::SharedStorage *shared_storage =
      reinterpret_cast<${operation_name}${operation_suffix}::SharedStorage *>(SharedStorageBase);

  ${operation_name}${operation_suffix} op;

  op(params, *shared_storage);
}
'''
    HostTemplate = r'''
extern "C" {
  // Get the size of params in bytes
  int ${operation_name}_get_param_size(){
    return sizeof(${operation_name}${operation_suffix}::Params);
  }

  // Get the size of dynamic shared memory in bytes
  int ${operation_name}_shared_memory_size() {
    return int(sizeof(${operation_name}${operation_suffix}::SharedStorage));
  }

  // Get the params as byte array
  char* ${operation_name}_get_params(${operation_name}${operation_suffix}::Params* params){
    char *bytes = ((char*)(params));
    char *output = new char[sizeof(${operation_name}${operation_suffix}::Params)];
    for (unsigned int i = 0; i < sizeof(${operation_name}${operation_suffix}::Params); i ++)
      output[i] = bytes[i];

    return output;
  }
}
'''

    def __init__(self, operation: ReductionOperation):
        super().__init__(operation)

        self.operation: ReductionOperation = operation
        self.emitter = EmitReductionInstance('_type')

        self.elements_per_access = self.operation.count
        self.argument_type, self.epilogue_type = get_reduction_params(operation.epilogue_functor)
        self.argtype = [ctypes.POINTER(self.argument_type)]

    def emit(self):
        return self.emitter.emit(self.operation)

    def plan(self, arguments: ReductionArguments):
        block_shape = [
            self.operation.shape.column() // self.elements_per_access,
            self.operation.shape.row(),
            1
        ]
        grid_shape = [
            (arguments.problem_size.row + self.operation.shape.row() - 1) // self.operation.shape.row(),
            (arguments.problem_size.column + self.operation.shape.column() - 1) // self.operation.shape.column(),
            1
        ]
        return LaunchConfiguration(grid_shape, block_shape, self.shared_memory_capacity)

    def initialize(self):
        err, = cuda.cuFuncSetAttribute(
            self.kernel,
            attrib=cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
            value=self.shared_memory_capacity)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('CUDA Error: {}'.format(err))

class ReductionOperation:
    """
    CUTLASS reduction operation for split-K partial sums

    :param shape: shape of the threadblock (CTA)
    :param C: description of the C operand
    :param epilogue_functor: output operator applied to the reduced result
    """

    def __init__(self, shape: cutlass.MatrixCoord, C: TensorDescription,
                 element_accumulator, element_workspace=None,
                 element_compute=None, epilogue_functor=None,
                 count: int = 1, partitions_per_stage: int = 4) -> None:
        """ Constructor
        """

        self.shape = shape
        #: epilogue functor (default: LinearCombination)
        self.epilogue_functor = epilogue_functor
        #: data type of accumulator
        self.element_accumulator = element_accumulator

        if element_workspace is None:
            #: data type of workspace
            self.element_workspace = element_accumulator
        else:
            #: data type of workspace
            self.element_workspace = element_workspace

        if element_compute is None:
            #: data type of computation
            self.element_compute = element_accumulator
        else:
            #: data type of computation
            self.element_compute = element_compute

        #: data type of output
        self.element_output = C.element

        #: operand C
        self.C: TensorDescription = C

        #: reduce op processing size (elements per access)
        self.count: int = count

        #: number of partitions to reduce per stage
        self.partitions_per_stage: int = partitions_per_stage

        self.rt_module: ReductionRT = ReductionRT(self)
        self.argument_type = self.rt_module.argument_type
        self.epilogue_type = self.rt_module.epilogue_type

    #
    def extended_name(self):
        extend_name = "${element_workspace}_${element_accumulator}_${element_compute}_${element_output}"

        return SubstituteTemplate(extend_name,
                                  {
                                      'element_workspace': DataTypeNames[self.element_workspace],
                                      'element_accumulator': DataTypeNames[self.element_accumulator],
                                      'element_compute': DataTypeNames[self.element_compute],
                                      'element_output': DataTypeNames[self.element_output]
                                  })

    #
    def configuration_name(self):
        ''' The full procedural name indicates architecture, extended name, and tile size '''

        configuration_name = "cutlass_reduce_split_k_${extended_name}_${threadblock}"

        threadblock = "%dx%d" % (
            self.shape.row(),
            self.shape.column()
        )

        return SubstituteTemplate(
            configuration_name,
            {
                'extended_name': self.extended_name(),
                'threadblock': threadblock
            }
        )

    #
    def procedural_name(self):
        ''' The full procedural name indicates architecture, extended name, and tile size '''
        return self.configuration_name()

    def run(self, arguments: ReductionArguments) -> cuda.CUresult:
        """
        Configure and launch the CUDA kernel with input arguments
        """
        # get launch configuration
        launch_config = self.rt_module.plan(arguments)

        # get the host and device workspace
        host_workspace = arguments.host_workspace
        device_workspace = None

        # launch the kernel
        err = self.rt_module.run(
            host_workspace, device_workspace, launch_config)

        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError('CUDA Error %s' % str(err))

        return err
|
||||
|
||||
|
||||
class EmitReductionInstance:
|
||||
def __init__(self, operation_suffix='') -> None:
|
||||
self.operation_suffix = operation_suffix
|
||||
self.includes = [
|
||||
"cutlass/cutlass.h",
|
||||
"cutlass/numeric_types.h",
|
||||
"cutlass/arch/arch.h",
|
||||
"cutlass/arch/mma.h",
|
||||
"cutlass/layout/matrix.h",
|
||||
"cutlass/gemm/device/gemm.h",
|
||||
"cutlass/gemm/device/gemm_universal_adapter.h",
|
||||
"cutlass/gemm/kernel/default_gemm_universal.h",
|
||||
"cutlass/reduction/kernel/reduce_split_k.h",
|
||||
"cutlass/reduction/thread/reduction_operators.h"
|
||||
]
|
||||
self.template = """
|
||||
// Reduction kernel instance
|
||||
using ${operation_name}_base =
|
||||
typename cutlass::reduction::kernel::ReduceSplitK<
|
||||
cutlass::MatrixShape<${shape_row}, ${shape_column}>,
|
||||
${epilogue_functor},
|
||||
cutlass::reduction::thread::ReduceAdd<
|
||||
${element_accumulator},
|
||||
${element_output},
|
||||
${count}>,
|
||||
${partition_per_stage}>;
|
||||
|
||||
struct ${operation_name}${operation_suffix}:
|
||||
public ${operation_name}_base { };
|
||||
"""
|
||||
|
||||
def emit(self, operation: ReductionOperation):
|
||||
|
||||
epilogue_vector_length = int(min(
|
||||
operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element])
|
||||
|
||||
values = {
|
||||
'operation_name': operation.configuration_name(),
|
||||
'operation_suffix': self.operation_suffix,
|
||||
'shape_row': str(operation.shape.row()),
|
||||
'shape_column': str(operation.shape.column()),
|
||||
'epilogue_functor': operation.epilogue_functor.emit(),
|
||||
'element_output': DataTypeTag[operation.element_output],
|
||||
'epilogue_vector_length': str(epilogue_vector_length),
|
||||
'element_accumulator': DataTypeTag[operation.element_accumulator],
|
||||
'element_compute': DataTypeTag[operation.element_compute],
|
||||
'element_workspace': DataTypeTag[operation.element_workspace],
|
||||
'count': str(operation.count),
|
||||
'partition_per_stage': str(operation.partitions_per_stage)
|
||||
}
|
||||
|
||||
return SubstituteTemplate(self.template, values)
|
||||
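
# Illustrative usage sketch (added in this cleanup; not part of the original file).
# It assumes pycutlass's TensorDescription(element, layout, alignment) and a
# LinearCombination epilogue with signature
# (element_output, count, element_accumulator, element_epilogue); adjust as needed.
#
#   C = TensorDescription(cutlass.float32, cutlass.RowMajor, 4)
#   epilogue = LinearCombination(cutlass.float32, 4, cutlass.float32, cutlass.float32)
#   reduction = ReductionOperation(
#       shape=cutlass.MatrixCoord(4, 32 * C.alignment),  # CTA shape
#       C=C, element_accumulator=cutlass.float32,
#       epilogue_functor=epilogue, count=C.alignment)
#   print(reduction.procedural_name())
#   # -> cutlass_reduce_split_k_f32_f32_f32_f32_4x128
#   print(EmitReductionInstance().emit(reduction))       # C++ source for the instance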
@@ -1,70 +0,0 @@
from typeguard import typechecked
import numpy as np
try:
    import torch
    torch_available = True
except ImportError:
    torch_available = False
from cuda import cuda
try:
    import cupy as cp
    cupy_available = True
except ImportError:
    cupy_available = False
import cutlass


# @typechecked
class TensorRef:
    """
    Python wrapper for cutlass.TensorRef
    """
    def __init__(self, tensor, dtype, layout) -> None:
        if isinstance(tensor, np.ndarray):
            ptr = cuda.CUdeviceptr(tensor.__array_interface__['data'][0])
        elif torch_available and isinstance(tensor, torch.Tensor):
            ptr = cuda.CUdeviceptr(tensor.data_ptr())
        elif cupy_available and isinstance(tensor, cp.ndarray):
            ptr = cuda.CUdeviceptr(int(tensor.data.ptr))
        elif isinstance(tensor, cuda.CUdeviceptr):
            ptr = tensor
        elif isinstance(tensor, int):
            ptr = cuda.CUdeviceptr(tensor)
        else:
            raise NotImplementedError(tensor)

        # dtype(0) is passed so that the call dispatches to the overload for the
        # right data type among tensor refs that share the same layout
        self.tensor_ref = cutlass.get_tensor_ref(int(ptr), dtype(0), layout)
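
# Illustrative usage sketch (added in this cleanup; not part of the original file):
# wrapping a NumPy buffer. The packed row-major layout construction below is an
# assumption based on the layout helpers used elsewhere in this diff.
#
#   a = np.zeros((128, 128), dtype=np.float32)
#   layout = cutlass.RowMajor.packed(cutlass.MatrixCoord(128, 128))
#   ref = TensorRef(a, cutlass.float32, layout)
#   # ref.tensor_ref bundles the raw pointer, element type, and layout.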
@@ -1,4 +0,0 @@
from pycutlass.test.profiler import *
from pycutlass.test.conv2d_testbed import *
from pycutlass.test.gemm_testbed import *
from pycutlass.test.gemm_grouped_testbed import *
@@ -1,632 +0,0 @@
import pycutlass
from pycutlass import *
from pycutlass.test import *
from time import sleep
from bfloat16 import bfloat16
import subprocess
from typeguard import typechecked
import re
import os
import numpy as np


def getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand):
    ptr = tensor.__array_interface__['data'][0]
    if operand == "a":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
    elif operand == "b":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
    elif operand in ["c", "d"]:
        tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
    else:
        raise ValueError("unknown operand: " + operand)

    layout = tensor_layout.packed(tensor_coord)

    if tensor.dtype == np.float64:
        return cutlass.TensorRefF64NHWC(ptr, layout)
    elif tensor.dtype == np.float32:
        return cutlass.TensorRefF32NHWC(ptr, layout)
    elif tensor.dtype == np.float16:
        return cutlass.TensorRefF16NHWC(ptr, layout)
    elif tensor.dtype == bfloat16:
        return cutlass.TensorRefBF16NHWC(ptr, layout)
    elif tensor.dtype == np.int32:
        return cutlass.TensorRefS32NHWC(ptr, layout)
    elif tensor.dtype == np.int8:
        if tensor_layout == cutlass.TensorNC32HW32:
            return cutlass.TensorRefS8NC32HW32(ptr, layout)
        elif tensor_layout == cutlass.TensorC32RSK32:
            return cutlass.TensorRefS8C32RSK32(ptr, layout)
        else:
            return cutlass.TensorRefS8NHWC(ptr, layout)
    else:
        raise ValueError("unsupported data type")


def getTensorView(tensor, tensor_layout, conv_kind, problem_size, operand):
    tensor_ref = getTensorRef(tensor, tensor_layout, conv_kind, problem_size, operand)

    if operand == "a":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_a_extent(conv_kind, problem_size)
    elif operand == "b":
        tensor_coord = cutlass.conv.implicit_gemm_tensor_b_extent(conv_kind, problem_size)
    elif operand in ["c", "d"]:
        tensor_coord = cutlass.conv.implicit_gemm_tensor_c_extent(conv_kind, problem_size)
    else:
        raise ValueError("unknown operand: " + operand)

    if tensor.dtype == np.float64:
        return cutlass.TensorViewF64NHWC(tensor_ref, tensor_coord)
    elif tensor.dtype == np.float32:
        return cutlass.TensorViewF32NHWC(tensor_ref, tensor_coord)
    elif tensor.dtype == np.float16:
        return cutlass.TensorViewF16NHWC(tensor_ref, tensor_coord)
    elif tensor.dtype == bfloat16:
        return cutlass.TensorViewBF16NHWC(tensor_ref, tensor_coord)
    elif tensor.dtype == np.int32:
        return cutlass.TensorViewS32NHWC(tensor_ref, tensor_coord)
    elif tensor.dtype == np.int8:
        if tensor_layout == cutlass.TensorNC32HW32:
            return cutlass.TensorViewS8NC32HW32(tensor_ref, tensor_coord)
        elif tensor_layout == cutlass.TensorC32RSK32:
            return cutlass.TensorViewS8C32RSK32(tensor_ref, tensor_coord)
        else:
            return cutlass.TensorViewS8NHWC(tensor_ref, tensor_coord)
    else:
        raise ValueError("unsupported data type")


# @typechecked
class Conv2dLauncher:
    """
    Launcher that runs the operation on a given problem size
    """
    def __init__(self, operation: 'Conv2dOperation', seed: int = 2080, interleaved=False,
                 verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:

        self.enable_cached_results = True
        self.interleaved = interleaved

        # create the reduction kernel
        self.reduction_operation = ReductionOperation(
            shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
            C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
            element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
            count=operation.C.alignment
        )

        #: verify the output result
        self.verification = verification
        #: profile the kernel's runtime
        self.profiling = profiling

        self.timer = GpuTimer()

        self.warmup_iterations = warmup_iterations
        self.iterations = iterations

        if "sleep" in kwargs.keys():
            self.sleep_time = kwargs["sleep"]
        else:
            self.sleep_time = 0

        #
        # Compile the operator
        #

        pycutlass.compiler.add_module([operation, self.reduction_operation])

        self.operation = operation

        self.dtype_A = Conv2dLauncher.numpy_type(operation.A.element)
        self.layout_A = operation.A.layout
        self.dtype_B = Conv2dLauncher.numpy_type(operation.B.element)
        self.layout_B = operation.B.layout
        self.dtype_C = Conv2dLauncher.numpy_type(operation.C.element)
        self.layout_C = operation.C.layout
        self.dtype_D = Conv2dLauncher.numpy_type(operation.C.element)
        self.layout_D = operation.C.layout

        accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
        element_size = DataTypeSize[operation.A.element]

        if element_size <= 8:
            self.scope = 1
        elif element_size == 16:
            if accumulator_size <= 16:
                self.scope = 2
            else:
                self.scope = 4
        else:
            self.scope = 7

        # seed for the pseudo-random generator
        self.seed = seed

        self.conv_kind = operation.conv_kind

        #
        # Get the host reference function
        #

        self.element_compute = operation.epilogue_functor.element_epilogue

        self.host_conv2d = cutlass.test.conv.host.conv2d

    @staticmethod
    def numpy_type(type):
        if type == cutlass.float64:
            return np.float64
        elif type == cutlass.float32:
            return np.float32
        elif type == cutlass.float16:
            return np.float16
        elif type == cutlass.bfloat16:
            return bfloat16
        elif type == cutlass.int32:
            return np.int32
        elif type == cutlass.int8:
            return np.int8
        else:
            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])

    def print_problem_size(self, p, split_k_mode=1):
        print("nhwc_%dx%dx%dx%d_krsc_%dx%dx%dx%d_padding_%dx%d_stride_%dx%d_dilation_%dx%d_splitkslices_%d_splitkmode_%d"
              % (p.N, p.H, p.W, p.C, p.K, p.R, p.S, p.C, p.pad_h,
                 p.pad_w, p.stride_h, p.stride_w, p.dilation_h, p.dilation_w, p.split_k_slices, split_k_mode))

    def uniform_init(self, size, dtype):
        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
            return np.ceil(
                np.random.uniform(
                    low=-self.scope - 0.5, high=self.scope - 0.5,
                    size=size).astype(dtype)
            )
        else:
            return np.random.uniform(
                low=-self.scope - 1, high=self.scope + 1,
                size=size).astype(dtype)

    def eq_gemm_size(self, problem_size):
        n = problem_size.N
        p = problem_size.P
        q = problem_size.Q
        k = problem_size.K
        r = problem_size.R
        s = problem_size.S
        c = problem_size.C
        h = problem_size.H
        w = problem_size.W
        if self.conv_kind == cutlass.conv.Operator.fprop:
            return cutlass.gemm.GemmCoord(n * p * q, k, r * s * c)
        elif self.conv_kind == cutlass.conv.Operator.dgrad:
            return cutlass.gemm.GemmCoord(n * h * w, c, k * r * s)
        else:
            return cutlass.gemm.GemmCoord(k, r * s * c, n * p * q)

    def bytes(self, problem_size, alpha, beta):
        mnk = self.eq_gemm_size(problem_size)

        bytes_ = \
            (DataTypeSize[self.operation.A.element] * mnk.m() // 8) * mnk.k() + \
            (DataTypeSize[self.operation.B.element] * mnk.n() // 8) * mnk.k() + \
            (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()

        if beta != 0:
            bytes_ += (DataTypeSize[self.operation.C.element] * mnk.m() // 8) * mnk.n()

        return bytes_

    def flops(self, problem_size):
        mnk = self.eq_gemm_size(problem_size)

        flops_mainloop_ = mnk.m() * mnk.n() * mnk.k() * 2
        flops_epilogue_ = mnk.m() * mnk.n() * 2

        # adjust the mainloop flop count for dgrad with non-unit stride
        if self.conv_kind == cutlass.conv.Operator.dgrad:
            flops_mainloop_ = flops_mainloop_ // (problem_size.stride_h * problem_size.stride_w)

        flops_total_ = flops_mainloop_ + flops_epilogue_

        return flops_total_
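
    # Worked example (added for clarity; not in the original source). For an fprop
    # problem with N=1, H=W=56, C=64, K=128, R=S=3, padding 1, and unit stride
    # (so P=Q=56), eq_gemm_size() maps the convolution onto
    #   GemmCoord(M, N, K) = (N*P*Q, K, R*S*C) = (3136, 128, 576),
    # and flops() reports 2*M*N*K + 2*M*N = 462,422,016 + 802,816 = 463,224,832.
    # A hypothetical problem size constructed the same way as the ones below:
    #
    #   p = cutlass.conv.Conv2dProblemSize(
    #       cutlass.Tensor4DCoord(1, 56, 56, 64),     # NHWC activation
    #       cutlass.Tensor4DCoord(128, 3, 3, 64),     # KRSC filter
    #       cutlass.Tensor4DCoord(1, 1, 1, 1),        # padding
    #       cutlass.MatrixCoord(1, 1),                # stride
    #       cutlass.MatrixCoord(1, 1),                # dilation
    #       cutlass.conv.Mode.cross_correlation, 1, 1)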
    def host_reference(self, problem_size, tensor_A, tensor_B, tensor_C, alpha, beta):
        if self.element_compute == cutlass.float16:
            alpha = cutlass.float16(alpha)
            beta = cutlass.float16(beta)
        elif self.element_compute == cutlass.int32:
            alpha = int(alpha)
            beta = int(beta)

        # whether a cached result has been loaded
        cached_result_loaded = False

        if self.enable_cached_results:
            # get the problem key
            cached_test_key = cutlass.test.conv.host.CreateCachedConv2dTestKey(
                self.conv_kind, problem_size, alpha, beta,
                getTensorView(tensor_A, self.layout_A, self.conv_kind, problem_size, "a"),
                getTensorView(tensor_B, self.layout_B, self.conv_kind, problem_size, "b"),
                getTensorView(tensor_C, self.layout_C, self.conv_kind, problem_size, "c"),
            )

            cached_test_result = cutlass.test.conv.host.CachedTestResult()

            conv2d_result_cache_name = "cached_results_SM%d_%d.txt" % (self.operation.arch, self.seed)

            cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
            cached = cached_results.find(cached_test_key)
            cached_result_loaded = cached[0]
            if cached_result_loaded:
                cached_test_result = cached[1]

        if not cached_result_loaded:
            # compute conv2d on the host
            tensor_D_ref = np.ones_like(tensor_C)
            tensor_ref_A = getTensorRef(tensor_A, self.layout_A, self.conv_kind, problem_size, "a")
            tensor_ref_B = getTensorRef(tensor_B, self.layout_B, self.conv_kind, problem_size, "b")
            tensor_ref_C = getTensorRef(tensor_C, self.layout_C, self.conv_kind, problem_size, "c")
            tensor_ref_D_ref = getTensorRef(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")

            self.host_conv2d(
                self.conv_kind, problem_size,
                tensor_ref_A, tensor_ref_B, tensor_ref_C, tensor_ref_D_ref,
                alpha, beta
            )

            tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")

            if self.enable_cached_results:
                cached_test_result.D = cutlass.test.conv.host.TensorHash(tensor_view_D_ref)
                cached_results = cutlass.test.conv.host.CachedTestResultListing(conv2d_result_cache_name)
                cached_results.append(cached_test_key, cached_test_result)
                cached_results.write(conv2d_result_cache_name)
            else:
                return tensor_D_ref

        return cached_test_result.D

    def equal(self, tensor_D, tensor_D_ref, problem_size):
        if self.enable_cached_results:
            tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
            tensor_D_hash = cutlass.test.conv.host.TensorHash(tensor_view_D)

            return tensor_D_hash == tensor_D_ref
        else:
            tensor_view_D = getTensorView(tensor_D, self.layout_D, self.conv_kind, problem_size, "d")
            tensor_view_D_ref = getTensorView(tensor_D_ref, self.layout_D, self.conv_kind, problem_size, "d")
            return cutlass.test.conv.host.equals(tensor_view_D, tensor_view_D_ref)

    def run_cutlass_profiler(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial, alpha=1.0, beta=0.0):

        if split_k_mode == cutlass.conv.SplitKMode.Serial:
            split_k_mode_ = "serial"
        else:
            split_k_mode_ = "parallel"

        cutlass_path = os.getenv('CUTLASS_PATH')
        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."

        values = {
            "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
            "kernel_name": self.operation.procedural_name(),
            "verification_providers": "device",
            "provider": "cutlass",
            'n': str(problem_size.N),
            'h': str(problem_size.H),
            'w': str(problem_size.W),
            'c': str(problem_size.C),
            'k': str(problem_size.K),
            'r': str(problem_size.R),
            's': str(problem_size.S),
            'p': str(problem_size.P),
            'q': str(problem_size.Q),
            'pad_h': str(problem_size.pad_h),
            'pad_w': str(problem_size.pad_w),
            'stride_h': str(problem_size.stride_h),
            'stride_w': str(problem_size.stride_w),
            'dilation_h': str(problem_size.dilation_h),
            'dilation_w': str(problem_size.dilation_w),
            'split_k_slices': str(problem_size.split_k_slices),
            'split_k_mode': split_k_mode_,
            'alpha': str(alpha),
            'beta': str(beta),
            'warmup': str(self.warmup_iterations),
            'profile': str(self.iterations)
        }

        cmd_template = \
            "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
            " --providers=${provider} --n=${n} --h=${h} --w=${w} --c=${c} --k=${k} --r=${r} --s=${s} --p=${p}" \
            " --q=${q} --pad_h=${pad_h} --pad_w=${pad_w} --stride_h=${stride_h} --stride_w=${stride_w}" \
            " --dilation_h=${dilation_h} --dilation_w=${dilation_w} --warmup-iterations=${warmup} --profiling-iterations=${profile}" \
            " --split_k_slices=${split_k_slices} --alpha=${alpha} --beta=${beta} --split_k_mode=${split_k_mode}"

        cmd = SubstituteTemplate(cmd_template, values)
        result = subprocess.getoutput(cmd)

        m = re.search(r"Runtime:\s+(?P<runtime>\d+\.\d+)", result)
        runtime = float(m.group('runtime'))

        m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
        bytes = int(m.group('bytes'))

        m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
        flops = int(m.group('flops'))

        # check that the problem size matches
        assert bytes == self.bytes(problem_size, alpha, beta)
        assert flops == self.flops(problem_size)

        return runtime

    def run(self, problem_size, split_k_mode=cutlass.conv.SplitKMode.Serial,
            alpha=1.0, beta=0.0):

        assert get_allocated_size() == 0, "%d bytes of pool memory were not released in the previous run" % get_allocated_size()

        #
        # Initialize input and output tensors
        #
        tensor_A_size = cutlass.conv.implicit_gemm_tensor_a_size(self.conv_kind, problem_size)
        tensor_B_size = cutlass.conv.implicit_gemm_tensor_b_size(self.conv_kind, problem_size)
        tensor_C_size = cutlass.conv.implicit_gemm_tensor_c_size(self.conv_kind, problem_size)

        np.random.seed(self.seed)

        tensor_A = self.uniform_init(size=(tensor_A_size,), dtype=self.dtype_A)
        tensor_B = self.uniform_init(size=(tensor_B_size,), dtype=self.dtype_B)
        tensor_C = self.uniform_init(size=(tensor_C_size,), dtype=self.dtype_C)
        tensor_D = np.zeros(shape=(tensor_C_size,), dtype=self.dtype_D)

        #
        # Launch the kernel
        #

        arguments = Conv2dArguments(
            operation=self.operation, problem_size=problem_size, A=tensor_A,
            B=tensor_B, C=tensor_C, D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            split_k_slices=problem_size.split_k_slices,
            split_k_mode=split_k_mode
        )

        if split_k_mode == cutlass.conv.SplitKMode.Parallel:
            implicit_gemm_size = cutlass.conv.implicit_gemm_problem_size(self.operation.conv_kind, arguments.problem_size)
            reduction_arguments = ReductionArguments(
                self.reduction_operation,
                problem_size=[implicit_gemm_size.m(), implicit_gemm_size.n()], partitions=problem_size.split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta)
            )

        self.operation.run(arguments)
        if split_k_mode == cutlass.conv.SplitKMode.Parallel:
            self.reduction_operation.run(reduction_arguments)

        passed = True
        if self.verification:
            if split_k_mode == cutlass.conv.SplitKMode.Parallel:
                reduction_arguments.sync()
            else:
                arguments.sync()

            tensor_D_ref = self.host_reference(problem_size, tensor_A, tensor_B, tensor_C, alpha, beta)

            passed = self.equal(tensor_D, tensor_D_ref, problem_size)

            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, split_k_mode)

        if self.profiling:
            sleep(self.sleep_time)
            for _ in range(self.warmup_iterations):
                self.operation.run(arguments)
                if split_k_mode == cutlass.conv.SplitKMode.Parallel:
                    self.reduction_operation.run(reduction_arguments)

            self.timer.start()
            for _ in range(self.iterations):
                self.operation.run(arguments)
                if split_k_mode == cutlass.conv.SplitKMode.Parallel:
                    self.reduction_operation.run(reduction_arguments)
            self.timer.stop_and_wait()
            runtime = self.timer.duration(self.iterations)

        # free memory
        del arguments
        if split_k_mode == cutlass.conv.SplitKMode.Parallel:
            del reduction_arguments

        assert get_allocated_size() == 0, "%d bytes of pool memory were not released after the current run" % get_allocated_size()
        if self.profiling:
            return runtime
        return passed


############################################################################################################
# TestAllConv: runs the cutlass::conv::device::ImplicitGemmConvolution operator and compares it with a
# reference implementation. It runs the conv operator on the default conv problem sizes from
# test::conv::device::TestbedConv2dProblemSizes. Additionally, each conv2d test can provide extra conv
# problem sizes (conv_test_sizes) and a blacklist of sizes (conv_blacklist_sizes).
############################################################################################################

def test_all_conv2d(operation: Conv2dOperation, conv_test_sizes=[], interleaved=False):
    passed = True
    #
    # Testbed object
    #

    testbed = Conv2dLauncher(operation, interleaved=interleaved)

    #
    # Get the conv problem sizes to run the conv operator on
    #

    conv_problems = cutlass.test.conv.TestbedConv2dProblemSizes(64)

    # vector of conv2d problem sizes used to avoid duplicate runs
    conv_tested_sizes = []

    # flatten the 2D problem vectors into 1D problem sizes
    problem_sizes = conv_problems.conv2d_default_sizes

    problem_sizes = [conv_problem for conv_problem in problem_sizes] + conv_test_sizes

    # Sweep the conv2d problem sizes (split-k-mode=kSerial, split-k-slices=1, alpha=1.0, beta=0.0)
    for conv_problem in problem_sizes:

        if conv_problem in conv_tested_sizes:
            continue

        # skip sizes whose channel dimensions are not multiples of 32 in the interleaved case
        if interleaved:
            if conv_problem.K % 32 != 0 or conv_problem.C % 32 != 0:
                continue

        #
        # Procedurally disable certain cases
        #

        # CUTLASS DGRAD's *unity* stride specialization only supports stride {1, 1}
        if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Unity:
            if not ((conv_problem.stride_h == 1) and (conv_problem.stride_w == 1)):
                continue

        if not interleaved:
            # the fixed-channels algorithm requires the channel count to match the access size
            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.fixed_channels:
                if conv_problem.C != operation.A.alignment:
                    continue

            # the few-channels algorithm requires the channel count to be a multiple of the access size
            if operation.iterator_algorithm == cutlass.conv.IteratorAlgorithm.few_channels:
                if conv_problem.C % operation.A.alignment:
                    continue

        # CUTLASS DGRAD's *strided* specialization supports all {stride_h, stride_w} combinations.
        # Although strided dgrad works for all of them, we only run strided dgrad
        # for non-unity strides.

        if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
            if (conv_problem.stride_h == 1) and (conv_problem.stride_w == 1):
                continue

        #
        # Test
        #

        # push back the tested problem size to avoid re-running duplicates
        conv_tested_sizes.append(conv_problem)

        passed = testbed.run(conv_problem)

        if not passed:
            return False

    if interleaved:
        return True
    #
    # Filter the cases for split-K
    #

    # Small-channels convolutions cannot run here.
    if operation.iterator_algorithm in [cutlass.conv.IteratorAlgorithm.fixed_channels, cutlass.conv.IteratorAlgorithm.few_channels]:
        return True

    # CUTLASS DGRAD's *strided* specialization does not support split-k mode
    if operation.conv_kind == cutlass.conv.Operator.dgrad and operation.stride_support == StrideSupport.Strided:
        conv_problem = cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 56, 56, 8),
            cutlass.Tensor4DCoord(8, 1, 1, 8),
            cutlass.Tensor4DCoord(0, 0, 0, 0),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        )
        passed = testbed.run(conv_problem)

        return passed

    # Sweep split-k-slices using serial and parallel reduction with non-unity alpha and non-zero
    # beta for a single conv2d problem size. Convolution unit tests take a long time to run, so
    # we only sweep the parameters that are absolutely necessary to catch functional bugs. The
    # code below does provide the option to sweep alpha and beta for local testing, but it only
    # runs one value for each.

    conv2d_split_k_test_size = cutlass.conv.Conv2dProblemSize(
        cutlass.Tensor4DCoord(1, 17, 11, 288),
        cutlass.Tensor4DCoord(160, 3, 3, 288),
        cutlass.Tensor4DCoord(1, 1, 1, 1),
        cutlass.MatrixCoord(1, 1),
        cutlass.MatrixCoord(1, 1),
        cutlass.conv.Mode.cross_correlation,
        1, 1
    )

    split_k_modes = [cutlass.conv.SplitKMode.Parallel, cutlass.conv.SplitKMode.Serial]

    split_k_slices = [1, 2, 3, 4, 201]
    problem_alpha = [2.0]
    problem_beta = [2.0]

    for split_k_mode in split_k_modes:
        for split_k_slice in split_k_slices:
            for alpha in problem_alpha:
                for beta in problem_beta:
                    passed = testbed.run(conv2d_split_k_test_size.reset_split_k_slices(split_k_slice),
                                         split_k_mode,
                                         alpha, beta)

    return passed
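
# Illustrative usage sketch (added in this cleanup; not part of the original file):
# `operation` is assumed to be a compiled pycutlass Conv2dOperation, and the device
# memory pool is assumed to be initialized (e.g. via pycutlass.get_memory_pool).
#
#   assert test_all_conv2d(operation)                            # default sizes
#   assert test_all_conv2d(operation, conv_test_sizes=[problem]) # extra sizes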
@@ -1,235 +0,0 @@
import pycutlass
from pycutlass.test.gemm_testbed import getTensorRef, getTensorView, transpose
from pycutlass import *
import numpy as np
import cutlass
from bfloat16 import bfloat16


class TestbedGrouped:
    def __init__(self, operation: GemmOperationGrouped, seed: int = 2080) -> None:

        pycutlass.compiler.add_module([operation])

        self.seed = seed

        self.operation = operation

        element_size = DataTypeSize[operation.A.element]

        self.dtype_A = self.numpy_type(operation.A.element)
        self.dtype_B = self.numpy_type(operation.B.element)
        self.dtype_C = self.numpy_type(operation.C.element)
        self.dtype_D = self.numpy_type(operation.C.element)

        if element_size == 1:
            self.scope_max = 1
            self.scope_min = 0
        elif element_size <= 8:
            self.scope_max = 1
            self.scope_min = -1
        elif element_size == 16:
            self.scope_max = 4
            self.scope_min = -4
        else:
            self.scope_max = 8
            self.scope_min = -8

        #: compute type
        self.compute_type = operation.epilogue_functor.element_epilogue

        self.accumulator_type = operation.tile_description.math_instruction.element_accumulator

    @staticmethod
    def numpy_type(type):
        if type == cutlass.float64:
            return np.float64
        elif type == cutlass.float32:
            return np.float32
        elif type == cutlass.float16:
            return np.float16
        elif type == cutlass.bfloat16:
            return bfloat16
        elif type == cutlass.int32:
            return np.int32
        elif type == cutlass.int8:
            return np.int8
        else:
            raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])

    def uniform_init(self, size, dtype):
        if dtype in [np.float32, np.float16, bfloat16, np.float64]:
            return np.ceil(
                np.random.uniform(
                    low=self.scope_min - 0.5, high=self.scope_max - 0.5,
                    size=size).astype(dtype)
            )
        else:
            return np.random.uniform(
                low=self.scope_min - 1, high=self.scope_max + 1,
                size=size).astype(dtype)

    def print_problem_size(self, p):
        problem_size = "problem: %d, %d, %d\n" % (p.m(), p.n(), p.k())
        print(problem_size)

    def run(self, problem_count: int, alpha: float = 1.0, beta: float = 0.0) -> bool:

        assert get_allocated_size() == 0, "%d bytes of pool memory were not released in the previous run" % get_allocated_size()

        # initialize the pseudo-random generator
        np.random.seed(self.seed)

        # generate the problem sizes
        problem_sizes = []
        tensor_As = []
        tensor_Bs = []
        tensor_Cs = []
        tensor_Ds = []
        tensor_D_refs = []

        for i in range(problem_count):
            if self.dtype_A == np.int8:
                if i == 0:
                    problem_size = cutlass.gemm.GemmCoord(48, 16, 32)
                else:
                    problem_size = cutlass.gemm.GemmCoord(
                        16 * np.random.randint(0, 64) + 48,
                        16 * np.random.randint(0, 64) + 48,
                        16 * np.random.randint(0, 64) + 48
                    )
            else:
                if i == 0:
                    problem_size = cutlass.gemm.GemmCoord(48, 16, 8)
                else:
                    problem_size = cutlass.gemm.GemmCoord(
                        8 * np.random.randint(0, 64) + 24,
                        8 * np.random.randint(0, 64) + 24,
                        8 * np.random.randint(0, 64) + 24
                    )

            tensor_As.append(
                self.uniform_init(
                    size=(problem_size.m() * problem_size.k(),),
                    dtype=self.dtype_A)
            )
            tensor_Bs.append(
                self.uniform_init(
                    size=(problem_size.n() * problem_size.k(),),
                    dtype=self.dtype_B)
            )
            tensor_Cs.append(
                self.uniform_init(
                    size=(problem_size.m() * problem_size.n(),),
                    dtype=self.dtype_C)
            )

            tensor_Ds.append(
                np.zeros(
                    shape=(problem_size.m() * problem_size.n(),),
                    dtype=self.dtype_D
                )
            )

            tensor_D_refs.append(
                np.ones(
                    shape=(problem_size.m() * problem_size.n(),),
                    dtype=self.dtype_D
                )
            )

            problem_sizes.append(problem_size)

        arguments = GemmGroupedArguments(
            operation=self.operation, problem_sizes=problem_sizes,
            A=tensor_As, B=tensor_Bs, C=tensor_Cs, D=tensor_Ds,
            output_op=self.operation.epilogue_type(alpha, beta)
        )

        self.operation.run(arguments)

        arguments.sync()

        #
        # Reference check
        #
        alpha = self.compute_type(alpha).value()
        beta = self.compute_type(beta).value()
        init_acc = self.accumulator_type(0).value()

        for idx, problem_size in enumerate(problem_sizes):
            if self.operation.switched:
                tensor_ref_A = getTensorRef(
                    tensor_As[idx], problem_size, "a", transpose(self.operation.B.layout))
                tensor_ref_B = getTensorRef(
                    tensor_Bs[idx], problem_size, "b", transpose(self.operation.A.layout))
                tensor_ref_C = getTensorRef(
                    tensor_Cs[idx], problem_size, "c", transpose(self.operation.C.layout))
                tensor_ref_D_ref = getTensorRef(
                    tensor_D_refs[idx], problem_size, "d", transpose(self.operation.C.layout))
            else:
                tensor_ref_A = getTensorRef(
                    tensor_As[idx], problem_size, "a", self.operation.A.layout)
                tensor_ref_B = getTensorRef(
                    tensor_Bs[idx], problem_size, "b", self.operation.B.layout)
                tensor_ref_C = getTensorRef(
                    tensor_Cs[idx], problem_size, "c", self.operation.C.layout)
                tensor_ref_D_ref = getTensorRef(
                    tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)

            tensor_view_D_ref = getTensorView(
                tensor_D_refs[idx], problem_size, "d", self.operation.C.layout)

            cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
                                        tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)

            tensor_view_D = getTensorView(
                tensor_Ds[idx], problem_size, "d", self.operation.C.layout)

            passed = cutlass.test.gemm.host.equals(
                tensor_view_D, tensor_view_D_ref)

            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size)

        del arguments

        assert get_allocated_size() == 0, "%d bytes of pool memory were not released after the current run" % get_allocated_size()

        return passed
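
# Illustrative usage sketch (added in this cleanup; not part of the original file):
# `operation` is assumed to be a pycutlass GemmOperationGrouped built and compiled
# elsewhere.
#
#   testbed = TestbedGrouped(operation, seed=2023)
#   assert testbed.run(problem_count=27, alpha=1.0, beta=0.5)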
@@ -1,594 +0,0 @@
from time import sleep
import pycutlass
from pycutlass import *
import pycutlass.utils.datatypes as datatypes
import cutlass
from cuda import cudart
from cuda import cuda
from bfloat16 import bfloat16
from .profiler import GpuTimer
import subprocess
import os
import re
import numpy as np


def transpose(layout):
    if layout == cutlass.RowMajor:
        return cutlass.ColumnMajor
    elif layout == cutlass.ColumnMajor:
        return cutlass.RowMajor
    elif layout == cutlass.ColumnMajorInterleaved32:
        return cutlass.RowMajorInterleaved32
    elif layout == cutlass.RowMajorInterleaved32:
        return cutlass.ColumnMajorInterleaved32
    else:
        raise ValueError("unsupported layout")


def getTensorRef(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout, batch_offset: int = 0):
    ptr = tensor.__array_interface__['data'][0]
    if operand == "a":
        tensor_coord = problem_size.mk()
        batch_stride = problem_size.m() * problem_size.k()
    elif operand == "b":
        tensor_coord = problem_size.kn()
        batch_stride = problem_size.k() * problem_size.n()
    elif operand in ["c", "d"]:
        tensor_coord = problem_size.mn()
        batch_stride = problem_size.m() * problem_size.n()
    else:
        raise ValueError("unknown operand: " + operand)

    elt_size = DataTypeSizeBytes[datatypes.to_cutlass(tensor.dtype)]
    ptr += batch_offset * batch_stride * elt_size

    if layout == cutlass.RowMajor:
        layout = cutlass.RowMajor.packed(tensor_coord)
        layout_tag = "RowMajor"
    elif layout == cutlass.ColumnMajor:
        layout = cutlass.ColumnMajor.packed(tensor_coord)
        layout_tag = "ColumnMajor"
    elif layout == cutlass.ColumnMajorInterleaved32:
        layout = cutlass.ColumnMajorInterleaved32.packed(tensor_coord)
        layout_tag = "ColumnMajorInterleaved32"
    elif layout == cutlass.RowMajorInterleaved32:
        layout = cutlass.RowMajorInterleaved32.packed(tensor_coord)
        layout_tag = "RowMajorInterleaved32"
    else:
        raise ValueError("unsupported layout")

    if tensor.dtype == np.float32:
        ref_name = "TensorRefF32" + layout_tag
    elif tensor.dtype == np.float64:
        ref_name = "TensorRefF64" + layout_tag
    elif tensor.dtype == np.float16:
        ref_name = "TensorRefF16" + layout_tag
    elif tensor.dtype == bfloat16:
        ref_name = "TensorRefBF16" + layout_tag
    elif tensor.dtype == np.int8:
        ref_name = "TensorRefS8" + layout_tag
    elif tensor.dtype == np.int32:
        ref_name = "TensorRefS32" + layout_tag
    else:
        raise ValueError("unsupported datatype %s" % ShortDataTypeNames[tensor.dtype])

    return getattr(cutlass, ref_name)(ptr, layout)


def getTensorView(tensor: np.ndarray, problem_size: cutlass.gemm.GemmCoord, operand: str, layout: cutlass.layout, batch_offset: int = 0):
    tensor_ref = getTensorRef(tensor, problem_size, operand, layout, batch_offset)

    if operand == "a":
        tensor_coord = problem_size.mk()
    elif operand == "b":
        tensor_coord = problem_size.kn()
    elif operand in ["c", "d"]:
        tensor_coord = problem_size.mn()
    else:
        raise ValueError("unknown operand: " + operand)

    if layout == cutlass.RowMajor:
        layout_tag = "RowMajor"
    elif layout == cutlass.ColumnMajor:
        layout_tag = "ColumnMajor"
    elif layout == cutlass.ColumnMajorInterleaved32:
        layout_tag = "ColumnMajorInterleaved32"
    elif layout == cutlass.RowMajorInterleaved32:
        layout_tag = "RowMajorInterleaved32"
    else:
        raise ValueError("unsupported layout")

    if tensor.dtype == np.float32:
        ref_name = "TensorViewF32" + layout_tag
    elif tensor.dtype == np.float64:
        ref_name = "TensorViewF64" + layout_tag
    elif tensor.dtype == np.float16:
        ref_name = "TensorViewF16" + layout_tag
    elif tensor.dtype == bfloat16:
        ref_name = "TensorViewBF16" + layout_tag
    elif tensor.dtype == np.int32:
        ref_name = "TensorViewS32" + layout_tag
    elif tensor.dtype == np.int8:
        ref_name = "TensorViewS8" + layout_tag
    else:
        raise ValueError("unsupported datatype")

    return getattr(cutlass, ref_name)(tensor_ref, tensor_coord)
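
# Illustrative usage sketch (added in this cleanup; not part of the original file):
# building host-side reference objects over a flat NumPy buffer for one operand.
#
#   problem_size = cutlass.gemm.GemmCoord(128, 128, 64)
#   A = np.zeros((problem_size.m() * problem_size.k(),), dtype=np.float32)
#   ref_a = getTensorRef(A, problem_size, "a", cutlass.RowMajor)
#   view_a = getTensorView(A, problem_size, "a", cutlass.RowMajor)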
class GemmUniversalLauncher:
|
||||
def __init__(self, operation: 'GemmOperationUniversal', seed: int = 2080, interleaved=False,
|
||||
verification=True, profiling=False, warmup_iterations=500, iterations=500, **kwargs) -> None:
|
||||
# create the reduction kernel
|
||||
self.reduction_operation: ReductionOperation = ReductionOperation(
|
||||
shape=cutlass.MatrixCoord(4, 32 * operation.C.alignment),
|
||||
C=operation.C, element_accumulator=operation.tile_description.math_instruction.element_accumulator,
|
||||
element_compute=operation.epilogue_functor.element_epilogue, epilogue_functor=operation.epilogue_functor,
|
||||
count=operation.C.alignment
|
||||
)
|
||||
|
||||
self.math_operation = operation.tile_description.math_instruction.math_operation
|
||||
|
||||
#: verify the output result
|
||||
self.verification = verification
|
||||
#: profile the kernel's runtime
|
||||
self.profiling = profiling
|
||||
|
||||
self.timer = GpuTimer()
|
||||
|
||||
self.warmup_iterations = warmup_iterations
|
||||
self.iterations = iterations
|
||||
|
||||
if "sleep" in kwargs.keys():
|
||||
self.sleep_time = kwargs["sleep"]
|
||||
else:
|
||||
self.sleep_time = 0
|
||||
|
||||
#
|
||||
# Compile the operator
|
||||
#
|
||||
|
||||
op_list = [operation]
|
||||
if operation.arch < 90:
|
||||
# Split K via Python is currently only supported for pre-SM90 kernels
|
||||
op_list.append(self.reduction_operation)
|
||||
|
||||
pycutlass.compiler.add_module(op_list)
|
||||
|
||||
self.operation = operation
|
||||
|
||||
self.dtype_A = GemmUniversalLauncher.numpy_type(operation.A.element)
|
||||
self.dtype_B = GemmUniversalLauncher.numpy_type(operation.B.element)
|
||||
self.dtype_C = GemmUniversalLauncher.numpy_type(operation.C.element)
|
||||
self.dtype_D = GemmUniversalLauncher.numpy_type(operation.C.element)
|
||||
|
||||
accumulator_size = DataTypeSize[operation.tile_description.math_instruction.element_accumulator]
|
||||
element_size = DataTypeSize[operation.A.element]
|
||||
|
||||
if element_size == 1:
|
||||
self.scope_max = 1
|
||||
self.scope_min = 0
|
||||
elif element_size <= 8:
|
||||
self.scope_max = 1
|
||||
self.scope_min = -1
|
||||
elif element_size == 16:
|
||||
self.scope_max = 4
|
||||
self.scope_min = -4
|
||||
else:
|
||||
self.scope_max = 8
|
||||
self.scope_min = -8
|
||||
|
||||
#: seed
|
||||
self.seed: int = seed
|
||||
|
||||
#: whether the layout is interleaved
|
||||
self.interleaved = interleaved
|
||||
|
||||
#: compute type
|
||||
self.compute_type = operation.epilogue_functor.element_epilogue
|
||||
self.accumulator_type = operation.tile_description.math_instruction.element_accumulator
|
||||
|
||||
def print_problem_size(self, p, mode, batch_count):
|
||||
if mode == cutlass.gemm.Mode.Gemm:
|
||||
mode = "Gemm"
|
||||
elif mode == cutlass.gemm.Mode.Batched:
|
||||
mode = "GemmBatched"
|
||||
elif mode == cutlass.gemm.Mode.GemmSplitKParallel:
|
||||
mode = "GemmSplitKParallel"
|
||||
problem_size = "problem: %d, %d, %d\n batch_count: %d\n mode: %s" % (
|
||||
p.m(), p.n(), p.k(), batch_count, mode)
|
||||
print(problem_size)
|
||||
|
||||
@staticmethod
|
||||
def numpy_type(type):
|
||||
if type == cutlass.float64:
|
||||
return np.float64
|
||||
elif type == cutlass.float32:
|
||||
return np.float32
|
||||
elif type == cutlass.float16:
|
||||
return np.float16
|
||||
elif type == cutlass.bfloat16:
|
||||
return bfloat16
|
||||
elif type == cutlass.int32:
|
||||
return np.int32
|
||||
elif type == cutlass.int8:
|
||||
return np.int8
|
||||
else:
|
||||
raise ValueError("unsupported type: %s" % ShortDataTypeNames[type])
|
||||
|
||||
def uniform_init(self, size, dtype):
|
||||
if dtype in [np.float32, np.float16, bfloat16, np.float64]:
|
||||
return np.ceil(
|
||||
np.random.uniform(
|
||||
low=self.scope_min - 0.5, high=self.scope_max - 0.5,
|
||||
size=size).astype(dtype)
|
||||
)
|
||||
else:
|
||||
return np.random.uniform(
|
||||
low=self.scope_min - 1, high=self.scope_max + 1,
|
||||
size=size).astype(dtype)
|
||||
|
||||
def reorder_tensor_B(self, tensor_B, problem_size):
|
||||
reordered_tensor_B = np.empty_like(tensor_B)
|
||||
tensor_ref_B = getTensorRef(
|
||||
tensor_B, problem_size, "b", self.operation.B.layout)
|
||||
reordered_tensor_ref_B = getTensorRef(
|
||||
reordered_tensor_B, problem_size, "b", self.operation.B.layout)
|
||||
cutlass.gemm.host.reorder_column(
|
||||
tensor_ref_B, reordered_tensor_ref_B, problem_size)
|
||||
return reordered_tensor_B
|
||||
|
||||
def host_reference(self, problem_size, batch_count, tensor_A, tensor_B, tensor_C, alpha, beta):
|
||||
tensor_D_ref = np.ones_like(tensor_C)
|
||||
alpha = self.numpy_type(self.compute_type)(alpha)
|
||||
beta = self.numpy_type(self.compute_type)(beta)
|
||||
init_acc = 0
|
||||
|
||||
alpha = self.compute_type(alpha).value()
|
||||
beta = self.compute_type(beta).value()
|
||||
init_acc = self.accumulator_type(init_acc).value()
|
||||
|
||||
for i in range(batch_count):
|
||||
if self.operation.switched:
|
||||
tensor_ref_A = getTensorRef(
|
||||
tensor_A, problem_size, "a", transpose(self.operation.B.layout), batch_offset=i)
|
||||
tensor_ref_B = getTensorRef(
|
||||
tensor_B, problem_size, "b", transpose(self.operation.A.layout), batch_offset=i)
|
||||
tensor_ref_C = getTensorRef(
|
||||
tensor_C, problem_size, "c", transpose(self.operation.C.layout), batch_offset=i)
|
||||
tensor_ref_D_ref = getTensorRef(
|
||||
tensor_D_ref, problem_size, "d", transpose(self.operation.C.layout), batch_offset=i)
|
||||
else:
|
||||
tensor_ref_A = getTensorRef(
|
||||
tensor_A, problem_size, "a", self.operation.A.layout, batch_offset=i)
|
||||
tensor_ref_B = getTensorRef(
|
||||
tensor_B, problem_size, "b", self.operation.B.layout, batch_offset=i)
|
||||
tensor_ref_C = getTensorRef(
|
||||
tensor_C, problem_size, "c", self.operation.C.layout, batch_offset=i)
|
||||
tensor_ref_D_ref = getTensorRef(
|
||||
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
|
||||
|
||||
if self.math_operation in [MathOperation.multiply_add_saturate]:
|
||||
cutlass.test.gemm.host.gemm_saturate(
|
||||
problem_size, alpha, tensor_ref_A, tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
|
||||
else:
|
||||
cutlass.test.gemm.host.gemm(problem_size, alpha, tensor_ref_A,
|
||||
tensor_ref_B, beta, tensor_ref_C, tensor_ref_D_ref, init_acc)
|
||||
|
||||
return tensor_D_ref
|
||||
|
||||
def equal(self, tensor_D, tensor_D_ref, problem_size, batch_count):
|
||||
for i in range(batch_count):
|
||||
tensor_view_D = getTensorView(
|
||||
tensor_D, problem_size, "d", self.operation.C.layout, batch_offset=i)
|
||||
tensor_view_D_ref = getTensorView(
|
||||
tensor_D_ref, problem_size, "d", self.operation.C.layout, batch_offset=i)
|
||||
|
||||
if not cutlass.test.gemm.host.equals(tensor_view_D, tensor_view_D_ref):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def bytes(self, problem_size, batch_count=1, alpha=1.0, beta=0.0):
|
||||
m = problem_size.m()
|
||||
n = problem_size.n()
|
||||
k = problem_size.k()
|
||||
|
||||
bytes = \
|
||||
(DataTypeSize[self.operation.A.element] * m // 8) * k + \
|
||||
(DataTypeSize[self.operation.B.element] * n // 8) * k + \
|
||||
(DataTypeSize[self.operation.C.element] * m // 8) * n
|
||||
|
||||
if beta != 0:
|
||||
bytes += (DataTypeSize[self.operation.C.element] * m // 8) * n
|
||||
|
||||
bytes *= batch_count
|
||||
|
||||
return bytes
|
||||
|
||||
def flops(self, problem_size, batch_count=1):
|
||||
m = problem_size.m()
|
||||
n = problem_size.n()
|
||||
k = problem_size.k()
|
||||
|
||||
flops_ = (m * n * k) * 2 * batch_count
|
||||
|
||||
return flops_
|
||||
|
||||
    def run_cutlass_profiler(self, mode, problem_size, batch_count=1, alpha=1.0, beta=0.0):

        cutlass_path = os.getenv('CUTLASS_PATH')
        assert cutlass_path is not None, "Environment variable 'CUTLASS_PATH' is not defined."

        values = {
            "profiler_path": cutlass_path + "/build/tools/profiler/cutlass_profiler",
            "kernel_name": self.operation.procedural_name(),
            "verification_providers": "device",
            "provider": "cutlass",
            "m": str(problem_size.m()),
            "n": str(problem_size.n()),
            "k": str(problem_size.k()),
            'split_k_slices': str(batch_count),
            'alpha': str(alpha),
            'beta': str(beta),
            'warmup': str(self.warmup_iterations),
            'profile': str(self.iterations)
        }

        cmd_template = \
            "${profiler_path} --kernels=${kernel_name} --verification-providers=${verification_providers}" \
            " --providers=${provider} --m=${m} --n=${n} --k=${k}"

        cmd = SubstituteTemplate(cmd_template, values)
        result = subprocess.getoutput(cmd)

        m = re.search(r"Runtime:\s+(?P<runtime>\d+\.\d+)", result)
        runtime = float(m.group('runtime'))

        m = re.search(r"Bytes:\s+(?P<bytes>\d+)", result)
        bytes = int(m.group('bytes'))

        m = re.search(r"FLOPs:\s+(?P<flops>\d+)", result)
        flops = int(m.group('flops'))

        # Check that the profiler agrees with our own byte and FLOP accounting.
        # The command above only passes m/n/k, so the profiler runs a single batch
        # and batch_count is left at its default here.
        assert bytes == self.bytes(problem_size, alpha=alpha, beta=beta)
        assert flops == self.flops(problem_size)

        return runtime

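    # For illustration, the command run_cutlass_profiler above substitutes for a
    # hypothetical kernel named cutlass_tensorop_s1688gemm_256x128_32x3 at
    # problem size 512x512x512 is:
    #
    #   $CUTLASS_PATH/build/tools/profiler/cutlass_profiler \
    #       --kernels=cutlass_tensorop_s1688gemm_256x128_32x3 \
    #       --verification-providers=device --providers=cutlass \
    #       --m=512 --n=512 --k=512
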
    def run(self, mode, problem_size, batch_count=1, split_k_slices=1, alpha=1.0, beta=0.0):
        assert get_allocated_size(
        ) == 0, "%d bytes of pool memory were not released in the previous run" % get_allocated_size()

        np.random.seed(self.seed)

        # Assign an actual batch count in cases where we are not running in batched mode.
        # This is to differentiate between the number of split K slices and the batch count,
        # which are overloaded within the single `batch_count` variable.
        true_batch_count = batch_count if mode == cutlass.gemm.Mode.Batched else 1

        tensor_A = self.uniform_init(
            size=(problem_size.m() * problem_size.k() * true_batch_count,), dtype=self.dtype_A)
        tensor_B = self.uniform_init(
            size=(problem_size.n() * problem_size.k() * true_batch_count,), dtype=self.dtype_B)
        tensor_C = self.uniform_init(
            size=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_C)
        tensor_D = np.zeros(
            shape=(problem_size.m() * problem_size.n() * true_batch_count,), dtype=self.dtype_D)

        #
        # Launch kernel
        #

        arguments = GemmArguments(
            operation=self.operation, problem_size=problem_size,
            A=tensor_A, B=tensor_B, C=tensor_C, D=tensor_D,
            output_op=self.operation.epilogue_type(alpha, beta),
            gemm_mode=mode, split_k_slices=split_k_slices, batch=batch_count
        )

        if mode == cutlass.gemm.Mode.GemmSplitKParallel:
            reduction_arguments = ReductionArguments(
                self.reduction_operation, problem_size=[
                    problem_size.m(), problem_size.n()],
                partitions=split_k_slices,
                workspace=arguments.ptr_D,
                destination=tensor_D,
                source=tensor_C,
                output_op=self.reduction_operation.epilogue_type(alpha, beta)
            )

        self.operation.run(arguments)

        if mode == cutlass.gemm.Mode.GemmSplitKParallel:
            self.reduction_operation.run(reduction_arguments)

        passed = True

        if self.verification:
            if mode == cutlass.gemm.Mode.GemmSplitKParallel:
                reduction_arguments.sync()
            else:
                arguments.sync()
            tensor_D_ref = self.host_reference(
                problem_size, true_batch_count, tensor_A, tensor_B, tensor_C, alpha, beta)
            passed = self.equal(tensor_D, tensor_D_ref, problem_size, true_batch_count)

            try:
                assert passed
            except AssertionError:
                self.print_problem_size(problem_size, mode, batch_count)

        if self.profiling:
            sleep(self.sleep_time)
            for _ in range(self.warmup_iterations):
                self.operation.run(arguments)
                if mode == cutlass.gemm.Mode.GemmSplitKParallel:
                    self.reduction_operation.run(reduction_arguments)

            self.timer.start()
            for _ in range(self.iterations):
                self.operation.run(arguments)
                if mode == cutlass.gemm.Mode.GemmSplitKParallel:
                    self.reduction_operation.run(reduction_arguments)
            self.timer.stop_and_wait()

            runtime = self.timer.duration(self.iterations)

        # free memory and clear buffers
        del arguments
        if mode == cutlass.gemm.Mode.GemmSplitKParallel:
            del reduction_arguments

        assert get_allocated_size(
        ) == 0, "%d bytes of pool memory were not released after the current run" % get_allocated_size()

        if self.profiling:
            return runtime
        return passed

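# A minimal sketch of driving the launcher directly (assumes `operation` is a
# GemmOperationUniversal that has already been constructed and compiled
# elsewhere); test_all_gemm below automates this over a sweep of problem sizes:
#
#   launcher = GemmUniversalLauncher(operation)
#   assert launcher.run(cutlass.gemm.Mode.Gemm, cutlass.gemm.GemmCoord(128, 128, 64))
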
def test_all_gemm(operation: 'GemmOperationUniversal', testcase="universal"):

    passed = True

    minimum_operand_element_size = min(
        DataTypeSize[operation.A.element], DataTypeSize[operation.B.element])
    opcode_class = operation.tile_description.math_instruction.opcode_class

    if opcode_class == cutlass.OpClass.Simt:
        alignment = 1
    else:
        alignment = 128 // minimum_operand_element_size

    # int8_t gemm alignment constraints
    if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 and operation.A.layout == cutlass.ColumnMajor:
        alignment_m = 4
    else:
        alignment_m = alignment

    if opcode_class == cutlass.OpClass.Simt and operation.B.element == cutlass.int8 and operation.A.layout == cutlass.RowMajor:
        alignment_n = 4
    else:
        alignment_n = alignment

    if opcode_class == cutlass.OpClass.Simt and operation.A.element == cutlass.int8 \
            and operation.B.element == cutlass.int8 \
            and (operation.A.layout == cutlass.RowMajor or operation.B.layout == cutlass.ColumnMajor):

        alignment_k = 4
    else:
        alignment_k = alignment

    threadblock_k = operation.tile_description.threadblock_shape[2]

    if testcase == "interleaved":
        if operation.A.layout in [cutlass.ColumnMajorInterleaved32, cutlass.RowMajorInterleaved32]:
            interleavedk = 32
        else:
            raise ValueError("Unknown layout")

    if testcase == "interleaved":
        modes = [cutlass.gemm.Mode.Gemm, ]
        problem_size_m = [interleavedk, 512 + interleavedk]
        problem_size_n = [interleavedk, 512 + interleavedk]
        problem_size_k = [interleavedk, threadblock_k *
                          operation.tile_description.stages + interleavedk]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1, ]
    elif testcase == "multistage":
        modes = [cutlass.gemm.Mode.Gemm, ]
        problem_size_m = [16, 528]
        problem_size_n = [16, 528]
        problem_size_k = [threadblock_k, threadblock_k * operation.tile_description.stages +
                          operation.tile_description.math_instruction.instruction_shape[2]]
        problem_alpha = [1.0]
        problem_beta = [0.0]
        batch_counts = [1, ]
    else:  # universal
        modes = [cutlass.gemm.Mode.Gemm]
        batch_counts = [1, 2, 3, 5, 7]
        if operation.arch < 90:
            # Split K kernels via Python are currently only supported pre-SM90
            modes.append(cutlass.gemm.Mode.GemmSplitKParallel)

        problem_size_m = [alignment_m, 512 - 3 * alignment_m]
        problem_size_n = [alignment_n, 512 - 2 * alignment_n]
        if operation.tile_description.stages is None:
            stages_for_k_calc = 7
        else:
            stages_for_k_calc = operation.tile_description.stages
        problem_size_k = [
            alignment_k,
            threadblock_k * stages_for_k_calc - alignment_k,
            threadblock_k * stages_for_k_calc * 3 - alignment_k]
        problem_alpha = [1.0]
        problem_beta = [2.0]

    testbed = GemmUniversalLauncher(
        operation, interleaved=(testcase == "interleaved"))

    for mode in modes:
        for m in problem_size_m:
            for n in problem_size_n:
                for k in problem_size_k:
                    for batch_count in batch_counts:
                        for alpha in problem_alpha:
                            for beta in problem_beta:
                                # skip very small K problems
                                if testcase == "universal":
                                    if (k // batch_count < 2 * threadblock_k):
                                        continue

                                problem_size = cutlass.gemm.GemmCoord(m, n, k)

                                if operation.arch < 90:
                                    split_k_slices = batch_count
                                else:
                                    split_k_slices = 1

                                overridden_mode = mode
                                if mode == cutlass.gemm.Mode.Gemm and batch_count > 1:
                                    overridden_mode = cutlass.gemm.Mode.Batched

                                passed = testbed.run(
                                    overridden_mode, problem_size, batch_count, split_k_slices, alpha, beta)

                                err, = cudart.cudaDeviceSynchronize()
                                if err != cuda.CUresult.CUDA_SUCCESS:
                                    raise RuntimeError(
                                        "CUDA Error %s" % str(err))

                                if not passed:
                                    return False

    return passed
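# A minimal usage sketch (assumes `operation` was built and compiled elsewhere):
#
#   if not test_all_gemm(operation, testcase="universal"):
#       raise Exception("GEMM test failed")
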
@ -1,70 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

from cuda import cuda
from cuda import cudart


class GpuTimer:
    def __init__(self) -> None:
        self.events = [
            cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1],
            cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
        ]

    def start(self, stream=cuda.CUstream(0)):
        err, = cuda.cuEventRecord(self.events[0], stream)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA Error %s" % str(err))

    def stop(self, stream=cuda.CUstream(0)):
        err, = cuda.cuEventRecord(self.events[1], stream)
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA Error %s" % str(err))

    def stop_and_wait(self, stream=cuda.CUstream(0)):
        self.stop(stream)
        if stream:
            err, = cuda.cuStreamSynchronize(stream)
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))
        else:
            err, = cudart.cudaDeviceSynchronize()
            if err != cuda.CUresult.CUDA_SUCCESS:
                raise RuntimeError("CUDA Error %s" % str(err))

    def duration(self, iterations=1):
        err, duration = cuda.cuEventElapsedTime(self.events[0], self.events[1])
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("CUDA Error %s" % str(err))
        return duration / float(iterations)
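# A minimal usage sketch of GpuTimer (the launched work is a placeholder):
#
#   timer = GpuTimer()
#   timer.start()
#   # ... enqueue kernels on the default stream ...
#   timer.stop_and_wait()
#   print("average ms per iteration:", timer.duration(iterations=1))
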
@ -1,109 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

import cutlass
from pycutlass import library, SubstituteTemplate


class Layout:
    """
    Utility class to map transpose and non-transpose terminology to row- and column-major terminology
    """
    T = cutlass.RowMajor
    N = cutlass.ColumnMajor


class LayoutCombination:
    """
    Utility class defining all combinations of row- and column-major layouts for operands to a GEMM
    """
    NNN = (Layout.N, Layout.N, Layout.N)
    NNT = (Layout.N, Layout.N, Layout.T)
    NTN = (Layout.N, Layout.T, Layout.N)
    NTT = (Layout.N, Layout.T, Layout.T)
    TNN = (Layout.T, Layout.N, Layout.N)
    TNT = (Layout.T, Layout.N, Layout.T)
    TTN = (Layout.T, Layout.T, Layout.N)
    TTT = (Layout.T, Layout.T, Layout.T)


def get_name(layouts, alignments, element_output,
             element_accumulator, element_epilogue, cluster_shape,
             threadblock_shape, stages, element_a, element_b, arch, opclass, suffix=""):
    """
    Generates a procedural name for a test case.

    :param layouts: indexable container of layouts of A, B, and C operands
    :param alignments: indexable container of alignments of A, B, and C operands
    :param element_output: data type of the output element
    :param element_accumulator: data type used in accumulation
    :param element_epilogue: data type used in computing the epilogue
    :param cluster_shape: indexable container of dimensions of threadblock cluster to be launched
    :param threadblock_shape: indexable container of dimensions of threadblock tiles
    :param stages: number of pipeline stages to use in the kernel
    :type stages: int
    :param element_a: data type of operand A
    :param element_b: data type of operand B
    :param arch: compute capability of kernel being generated
    :type arch: int
    :param opclass: class of operation being performed (e.g., SIMT, Tensor Core)
    :type opclass: cutlass.OpClass
    :param suffix: additional string to add to the suffix of the name
    :type suffix: str

    :return: str
    """
    name_format = 'test_SM${arch}_Device_Gemm_${eA}${lA}_${eB}${lB}_${eC}${lC}_${opclass}_${acc}_${tbM}x${tbN}x${tbK}_${cM}x${cN}x${cK}_${stages}_align${aA}-${aB}-${aC}${suffix}'
    return SubstituteTemplate(name_format,
                              {
                                  'arch': str(arch),
                                  'eA': library.DataTypeNames[element_a],
                                  'eB': library.DataTypeNames[element_b],
                                  'eC': library.DataTypeNames[element_output],
                                  'lA': library.ShortLayoutTypeNames[layouts[0]],
                                  'lB': library.ShortLayoutTypeNames[layouts[1]],
                                  'lC': library.ShortLayoutTypeNames[layouts[2]],
                                  'opclass': library.OpcodeClassNames[opclass],
                                  'acc': library.DataTypeNames[element_accumulator],
                                  'cM': str(cluster_shape[0]),
                                  'cN': str(cluster_shape[1]),
                                  'cK': str(cluster_shape[2]),
                                  'tbM': str(threadblock_shape[0]),
                                  'tbN': str(threadblock_shape[1]),
                                  'tbK': str(threadblock_shape[2]),
                                  'stages': str(stages) if stages is not None else 'auto',
                                  'aA': str(alignments[0]),
                                  'aB': str(alignments[1]),
                                  'aC': str(alignments[2]),
                                  'suffix': '' if suffix is None else suffix
                              }
                              )
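# An illustrative call (parameter values are hypothetical; the exact output
# string depends on the DataTypeNames/ShortLayoutTypeNames/OpcodeClassNames
# tables in pycutlass.library):
#
#   get_name(LayoutCombination.TNN, [8, 8, 8], cutlass.float16, cutlass.float32,
#            cutlass.float32, [1, 1, 1], [128, 128, 32], 4,
#            cutlass.float16, cutlass.float16, 80, cutlass.OpClass.TensorOp)
#   # -> 'test_SM80_Device_Gemm_f16t_f16n_f16n_tensorop_f32_128x128x32_1x1x1_4_align8-8-8'
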
@ -1,39 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

from typing import Union
from typeguard import typechecked


GemmOperation = 'Union[GemmOperationUniversal, GemmOperationGrouped]'

Tensor = 'Union[cuda.CUdeviceptr, np.ndarray, torch.Tensor, cp.ndarray]'
@ -1 +0,0 @@
from pycutlass.utils.reference_model import *
@ -1,121 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

"""
|
||||
Utility functions for converting between frontend datatypes and CUTLASS datatypes
|
||||
"""
|
||||
|
||||
from typing import Union, Tuple
|
||||
|
||||
import cutlass
|
||||
|
||||
import pycutlass.library as library
|
||||
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
numpy_available = True
|
||||
except ImportError:
|
||||
numpy_available = False
|
||||
|
||||
def numpy_to_cutlass(inp):
|
||||
if numpy_available:
|
||||
if inp == np.float16:
|
||||
return cutlass.float16
|
||||
elif inp == np.float32:
|
||||
return cutlass.float32
|
||||
elif inp == np.float64:
|
||||
return cutlass.float64
|
||||
elif inp == np.int8:
|
||||
return cutlass.int8
|
||||
elif inp == np.int32:
|
||||
return cutlass.int32
|
||||
return None
|
||||
|
||||
try:
|
||||
import cupy as cp
|
||||
cupy_available = True
|
||||
cupy_to_cutlass_dict = {
|
||||
cp.float16: cutlass.float16,
|
||||
cp.float32: cutlass.float32,
|
||||
cp.float64: cutlass.float64
|
||||
}
|
||||
except ImportError:
|
||||
cupy_available = False
|
||||
|
||||
def cupy_to_cutlass(inp):
|
||||
if cupy_available:
|
||||
if inp == cp.float16:
|
||||
return cutlass.float16
|
||||
elif inp == cp.float32:
|
||||
return cutlass.float32
|
||||
elif inp == cp.float64:
|
||||
return cutlass.float64
|
||||
return None
|
||||
|
||||
try:
|
||||
import torch
|
||||
torch_available = True
|
||||
torch_to_cutlass_dict = {
|
||||
torch.half: cutlass.float16,
|
||||
torch.float16: cutlass.float16,
|
||||
torch.float: cutlass.float32,
|
||||
torch.float32: cutlass.float32,
|
||||
torch.double: cutlass.float64,
|
||||
torch.float64: cutlass.float64
|
||||
}
|
||||
except ImportError:
|
||||
torch_available = False
|
||||
|
||||
def torch_to_cutlass(inp):
|
||||
if torch_available:
|
||||
return torch_to_cutlass_dict.get(inp, None)
|
||||
|
||||
try:
|
||||
import bfloat16
|
||||
bfloat16_available = True
|
||||
except ImportError:
|
||||
bfloat16_available = False
|
||||
|
||||
def bfloat16_to_cutlass(inp):
|
||||
if bfloat16_available:
|
||||
if inp == bfloat16.bfloat16:
|
||||
return cutlass.bfloat16
|
||||
|
||||
|
||||
def to_cutlass(inp):
|
||||
for cvt_fn in [bfloat16_to_cutlass, cupy_to_cutlass, numpy_to_cutlass, torch_to_cutlass]:
|
||||
out = cvt_fn(inp)
|
||||
if out is not None:
|
||||
return out
|
||||
|
||||
raise Exception('No available conversion from type {} to a CUTLASS type.'.format(inp))
|
||||
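# A minimal usage sketch (assumes numpy is installed):
#
#   import numpy as np
#   assert to_cutlass(np.float32) == cutlass.float32
#   to_cutlass(str)   # raises Exception: no conversion available
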
@ -1,76 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

"""
|
||||
Utility functions for interacting with the device
|
||||
"""
|
||||
|
||||
from cuda import cudart
|
||||
|
||||
|
||||
def check_cuda_errors(result: list):
|
||||
"""
|
||||
Checks whether `result` contains a CUDA error raises the error as an exception, if so. Otherwise,
|
||||
returns the result contained in the remaining fields of `result`.
|
||||
|
||||
:param result: the results of the `cudart` method, consisting of an error code and any method results
|
||||
:type result: list
|
||||
|
||||
:return: non-error-code results from the `results` parameter
|
||||
"""
|
||||
# `result` is of the format : (cudaError_t, result...)
|
||||
err = result[0]
|
||||
if err.value:
|
||||
raise RuntimeError("CUDA error: {}".format(cudart.cudaGetErrorName(err)))
|
||||
|
||||
if len(result) == 1:
|
||||
return None
|
||||
elif len(result) == 2:
|
||||
return result[1]
|
||||
else:
|
||||
return result[1:]
|
||||
|
||||
|
||||
def device_cc(device: int = 0) -> int:
|
||||
"""
|
||||
Returns the compute capability of the device with ID `device`.
|
||||
|
||||
:param device: ID of the device to query
|
||||
:type device: int
|
||||
|
||||
:return: compute capability of the queried device (e.g., 80 for SM80)
|
||||
:rtype: int
|
||||
"""
|
||||
deviceProp = check_cuda_errors(cudart.cudaGetDeviceProperties(device))
|
||||
major = str(deviceProp.major)
|
||||
minor = str(deviceProp.minor)
|
||||
return int(major + minor)
|
||||
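# A minimal usage sketch:
#
#   num_devices = check_cuda_errors(cudart.cudaGetDeviceCount())
#   if num_devices > 0 and device_cc(0) >= 80:
#       print("device 0 can run SM80 kernels")
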
@ -1,255 +0,0 @@
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import cutlass
from pycutlass.library import TensorDescription
from typing import Union
from bfloat16 import bfloat16
try:
    import torch
    torch_available = True
except ImportError:
    torch_available = False

class ReferenceModule:
    def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription) -> None:
        self.layout_A = A.layout
        self.layout_B = B.layout
        self.layout_C = C.layout

    def run(self, A: np.ndarray, B: np.ndarray, C: np.ndarray, problem_size: cutlass.gemm.GemmCoord, alpha: float = 1.0, beta: float = 0.0, bias=False, batch=1):
        """
        Compute the reference result on CPU.

        Args:
            A: dense operator with shape (M, K) in row-major and (K, M) in column-major
            B: dense operator with shape (K, N) in row-major and (N, K) in column-major
            C: dense operator with shape (M, N) in row-major and (N, M) in column-major
        """
        M, N, K = problem_size.m(), problem_size.n(), problem_size.k()
        if isinstance(A, np.ndarray):
            if self.layout_A == cutlass.RowMajor:
                A_row = np.reshape(A, newshape=(batch, M, K))
            else:
                A_col = np.reshape(A, newshape=(batch, K, M))
                A_row = np.transpose(A_col, axes=(0, 2, 1))

            if self.layout_B == cutlass.RowMajor:
                B_row = np.reshape(B, newshape=(batch, K, N))
            else:
                B_col = np.reshape(B, newshape=(batch, N, K))
                B_row = np.transpose(B_col, axes=(0, 2, 1))

            if self.layout_C == cutlass.RowMajor:
                if bias:
                    C_row = np.reshape(C, newshape=(batch, 1, N))
                else:
                    C_row = np.reshape(C, newshape=(batch, M, N))
            else:
                if bias:
                    C_row = np.reshape(C, newshape=(batch, M, 1))
                else:
                    C_col = np.reshape(C, newshape=(batch, N, M))
                    C_row = np.transpose(C_col, axes=(0, 2, 1))

            if A_row.dtype == bfloat16:
                # numpy's einsum doesn't support bfloat16
                out_row = np.einsum("bik,bkj->bij", A_row.astype(np.float32), B_row.astype(np.float32)) * alpha + C_row * beta
                out_row = out_row.astype(C_row.dtype)
            else:
                out_row = np.einsum("bik,bkj->bij", A_row, B_row) * alpha + C_row * beta

            if self.layout_C == cutlass.ColumnMajor:
                out = np.transpose(out_row, axes=(0, 2, 1))
            else:
                out = out_row

            return out.ravel()

        elif isinstance(A, torch.Tensor):
            if self.layout_A == cutlass.RowMajor:
                A_row = A.view((M, K))
            else:
                A_col = A.view((K, M))
                A_row = torch.permute(A_col, (1, 0))

            if self.layout_B == cutlass.RowMajor:
                B_row = B.view((K, N))
            else:
                B_col = B.view((N, K))
                B_row = torch.permute(B_col, (1, 0))

            if self.layout_C == cutlass.RowMajor:
                C_row = C.view((M, N))
            else:
                C_col = C.view((N, M))
                C_row = torch.permute(C_col, (1, 0))

            out_row = torch.matmul(A_row, B_row) * alpha + C_row * beta

            if self.layout_C == cutlass.ColumnMajor:
                out = torch.permute(out_row, (1, 0))
            else:
                out = out_row

            return torch.flatten(out)


#####################################################################################################
# Conv2d
#####################################################################################################

if torch_available:
    class Conv2dReferenceModule:
        def __init__(self, A: TensorDescription, B: TensorDescription, C: TensorDescription, kind: cutlass.conv.Operator) -> None:
            self.layout_A = A.layout
            self.layout_B = B.layout
            self.layout_C = C.layout
            self.kind = kind

        def run(self,
                A: Union[np.ndarray, torch.Tensor],
                B: Union[np.ndarray, torch.Tensor],
                C: Union[np.ndarray, torch.Tensor], problem_size, alpha=1.0, beta=0.0, bias=False) -> np.ndarray:
            """
            Compute the reference result on CPU.
            """
            n = problem_size.N
            h = problem_size.H
            w = problem_size.W
            c = problem_size.C

            k = problem_size.K
            r = problem_size.R
            s = problem_size.S

            p = problem_size.P
            q = problem_size.Q

            stride_h = problem_size.stride_h
            stride_w = problem_size.stride_w

            pad_h = problem_size.pad_h
            pad_w = problem_size.pad_w

            dilation_h = problem_size.dilation_h
            dilation_w = problem_size.dilation_w

            groups = problem_size.groups

            if isinstance(A, np.ndarray):
                # the pytorch activation layout is NCHW
                # weight layout is Cout Cin Kh Kw (also NCHW)
                if self.layout_A == cutlass.TensorNHWC:
                    A_nhwc = np.reshape(A, newshape=(n, h, w, c))
                    A_torch_nhwc = torch.from_numpy(A_nhwc).to("cuda")
                    A_torch_nchw = torch.permute(A_torch_nhwc, (0, 3, 1, 2))

                if self.layout_B == cutlass.TensorNHWC:
                    B_nhwc = np.reshape(B, newshape=(k, r, s, c))
                    B_torch_nhwc = torch.from_numpy(B_nhwc).to("cuda")
                    B_torch_nchw = torch.permute(B_torch_nhwc, (0, 3, 1, 2))

                if self.layout_C == cutlass.TensorNHWC:
                    C_nhwc = np.reshape(C, newshape=(n, p, q, k))
                    C_torch_nhwc = torch.from_numpy(C_nhwc).to("cuda")
                    C_torch_nchw = torch.permute(C_torch_nhwc, (0, 3, 1, 2))

            elif isinstance(A, torch.Tensor):
                if self.kind == cutlass.conv.Operator.wgrad:
                    if self.layout_A == cutlass.TensorNHWC:
                        A_nhwc = A.view((n, p, q, k))
                        A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))

                    if self.layout_B == cutlass.TensorNHWC:
                        B_nhwc = B.view((n, h, w, c))
                        B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))

                    if self.layout_C == cutlass.TensorNHWC:
                        if bias:
                            C_nhwc = C.view((1, 1, 1, c))
                        else:
                            C_nhwc = C.view((k, r, s, c))
                        C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
                elif self.kind == cutlass.conv.Operator.dgrad:
                    if self.layout_A == cutlass.TensorNHWC:
                        A_nhwc = A.view((n, p, q, k))
                        A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))

                    if self.layout_B == cutlass.TensorNHWC:
                        B_nhwc = B.view((k, r, s, c))
                        B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))

                    if self.layout_C == cutlass.TensorNHWC:
                        if bias:
                            C_nhwc = C.view((1, 1, 1, c))
                        else:
                            C_nhwc = C.view((n, h, w, c))
                        C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))
                else:
                    if self.layout_A == cutlass.TensorNHWC:
                        A_nhwc = A.view((n, h, w, c))
                        A_torch_nchw = torch.permute(A_nhwc, (0, 3, 1, 2))

                    if self.layout_B == cutlass.TensorNHWC:
                        B_nhwc = B.view((k, r, s, c))
                        B_torch_nchw = torch.permute(B_nhwc, (0, 3, 1, 2))

                    if self.layout_C == cutlass.TensorNHWC:
                        if bias:
                            C_nhwc = C.view((1, 1, 1, k))
                        else:
                            C_nhwc = C.view((n, p, q, k))
                        C_torch_nchw = torch.permute(C_nhwc, (0, 3, 1, 2))

            if self.kind == cutlass.conv.Operator.fprop:
                D_torch_nchw = alpha * torch.nn.functional.conv2d(
                    A_torch_nchw, B_torch_nchw, stride=(stride_h, stride_w),
                    padding=(pad_h, pad_w), dilation=(dilation_h, dilation_w), groups=groups) + beta * C_torch_nchw
            elif self.kind == cutlass.conv.Operator.dgrad:
                D_torch_nchw = alpha * torch.nn.grad.conv2d_input(
                    (n, c, h, w), B_torch_nchw, A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
                ).to(torch.float32) + beta * C_torch_nchw
            elif self.kind == cutlass.conv.Operator.wgrad:
                D_torch_nchw = alpha * torch.nn.grad.conv2d_weight(
                    B_torch_nchw, (k, c, r, s), A_torch_nchw, padding=(pad_h, pad_w), stride=(stride_h, stride_w)
                ).to(torch.float32) + beta * C_torch_nchw

            if self.layout_C == cutlass.TensorNHWC:
                if isinstance(A, np.ndarray):
                    D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1)).detach().cpu().numpy()
                elif isinstance(A, torch.Tensor):
                    D_torch_out = torch.permute(D_torch_nchw, (0, 2, 3, 1))

            return D_torch_out.flatten()
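# A minimal sketch of the numpy path of ReferenceModule (row-major f32 operands;
# the TensorDescription arguments here are illustrative):
#
#   desc = TensorDescription(cutlass.float32, cutlass.RowMajor, 1)
#   ref = ReferenceModule(desc, desc, desc)
#   M, N, K = 8, 8, 4
#   A = np.random.rand(M * K).astype(np.float32)
#   B = np.random.rand(K * N).astype(np.float32)
#   C = np.zeros(M * N, dtype=np.float32)
#   D = ref.run(A, B, C, cutlass.gemm.GemmCoord(M, N, K))
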
@ -1,274 +0,0 @@
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 2104699940 3506659864 557648934
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1539314507 3971227455 1976927351 1642148785
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 276489656 653235219 3147305346 880610205
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 272457724 2178229139 2786201726 4170295839
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 242235041 2149454506 784935854 682531065
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 3478189705 1667216236 1437761176
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 379326961 1780379994 3740415776
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 215903418 924848818 3533854396 2683779476
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 359232443 2147867990 1653277018
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2870331951 3784314846 2644315999 4224154526
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3787448414 3562991793 535073859 2563373454
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 426169840 2464808416 864648234 461884698
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2564934525 3910792915 3577331017 827498183
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 28479234 867695528 1947311971 83328334
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4192922822 4244595864 2296602326 2349214706
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 274678245 3464152269 1682550229 3446204619
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3993280136 828543035 1319748516 956044554
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 832003025 3799813757 4030292245 457791957
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1444316594 4129865888 93616503 412257611
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 36703874
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2931873718 1841508064 1497852219 1842147148
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1612565294 109894479 1782187316 3370789453
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 841569299 1010785577 1158956167 3261208135
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1893352157 48149942 3544807462 446577726
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 3585320147 2150950452 1625817025 3964129474
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 2624928614 3423533117 3186342135
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 2732296888 1838622641 4203745561
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3456572634 893492926 1966259884
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 4014726279 4027869577 1510990157
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 4140605332 3580988556 3425909428
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2106553169 835800311 3417471222
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 860217059 166776702 1109666471
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 855244826 2670006594 3857976152
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 3079461262 3579256638 2926210806
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2952423142 2045838875 3445165841
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 2133381336 2601441527 2035094220
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 1700915522 2515933441 406719240
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 156533442 1012781676 688128904
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 3612826298 2531545294 476754549
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 2391975923 197605094 3409942185
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3071904063 408984565 2378809888
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 3067676760 1540919649 2008865071
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 1085505037 2778215386 230227569
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2731079464 3570839563 3483629877
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 408419601 3415600242 2106927195
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 3606099389 4034802752 3200055633
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 3910244699 1319285699 2229775542
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 2780071616 2703730845 3090625734
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 4278696824 360883914 3802692600
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 653419877 359675571 283806385
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 1075980921 3101013494 2025203940
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1393431534 1148212814 1350914659
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 4283492776 419570292 1210341563
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4178596783 3828059710 2735749436 2671012171
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 924522595 563724475 3750778972 4152580670
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1021044158 1686067905 3765040166 4102272733
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 2674994719 635224486 2759329777
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 4201252830 2920298728 304256151
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3335547 70289262 646435722 4137562540
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 1288095320 2132879813 656196754
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1317457392 2202157489 2326567490 2475188414
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2476454437 1857118302 4164386062 239840568
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2767650699 3514840131 590439733 3879821123
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3896287283 3112762669 2515107934 2106635937
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 1903067870 1021832870 3003938078 2751931686
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3489785028 2466126497 1374078692 2737628040
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2051350923 263676708 3639860119 1370886256
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 719099834 1474713672 204857540 2768940347
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3441724486 3162593831 421721594 3097845598
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2034354027 1249407570 2567025479 1441082595
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 2369653089
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 941893937 3608468045 635631428 1218705038
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 172579142 319546523 718795680 1453661415
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2823351660 1326352711 1110204809 1155441703
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3238446487 2572503545 686287700 1559476701
conv2d fprop_1x8x8x1_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 991402150 1883874274 1180207512 3934800419
conv2d fprop_1x16x16x1_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 4230587034 4117433929 2540623821
conv2d fprop_1x16x16x1_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 3802993432 1563447158 515257167
conv2d fprop_1x224x224x1_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2583340103 3928463259 1564251818
conv2d fprop_1x224x224x1_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 2966178620 3457283045 1726663817
conv2d fprop_1x224x224x1_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 3101289788 3492498648
conv2d fprop_1x224x224x1_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 7656882 1794561978 498358130 4111289929
conv2d fprop_1x8x8x2_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2693144988 3876248534 3038023830 1910263513
conv2d fprop_1x16x16x2_8x8_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3355193355 319259163 535683577
conv2d fprop_1x16x16x2_12x12_16x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 1548147432 3385829172 2741952709
conv2d fprop_1x224x224x2_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 2686562907 3948710179 3669872932
conv2d fprop_1x224x224x2_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 576815792 2317227037 1211532666
conv2d fprop_1x224x224x2_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 555460201 895685163
conv2d fprop_1x224x224x2_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3479872296 27596985 1465341652 2228916523
conv2d fprop_1x8x8x4_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 24290453 137535877 1436667267 1395660627
conv2d fprop_1x224x224x4_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 2226159049 4051661898 209529384
conv2d fprop_1x224x224x4_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 3541851870 2271016226 2671623385
conv2d fprop_1x224x224x4_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 2007343215 3362992769
conv2d fprop_1x224x224x4_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 2495921302 982184919 20610297 1086800078
conv2d fprop_1x8x8x8_4x4_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 4208297221 3117444553 1497663382 3561001103
conv2d fprop_1x224x224x8_220x220_32x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 1414143072 827338392 2827855918
conv2d fprop_1x224x224x8_110x110_64x7x7_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 3886996022 26545788 3407771964
conv2d fprop_1x224x224x8_222x222_64x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 2374613655 3601677176
conv2d fprop_1x224x224x8_111x111_64x5x5_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_f_f 3188907679 380272816 778374730 2110111988
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1736512560 49406874 846358010 3314905564
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1848484956 1432417472 1903569827 3750799351
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4236427320 3696009469 69852620 201921851
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 109006944 450017448 1793784844 903209915
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 813367872 2397796503 1928191746 3210229460
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1307184141 46021356 1674017987
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 1212511562 3331767121 2446286369
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1348284291 2013675943 1681111033 1469213228
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 500298386 3218034344 4159283207
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1703349794 1123534155 145385311 4273847179
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3862659311 349459322 1503631520 1404971956
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1623686755 961217371 552550209 3980749384
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3554927580 1131648083 4149599295 3119557776
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1767639287 3350675774 128324027 1059816532
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3986143536 17411088 40173029 1694092310
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1157793540 3513299281 48848814 1435528367
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 988962069 4292634763 388976034 2674929544
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4202383208 3529769234 1046186503 3368902675
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 856448884 3057259762 2063087558 1995545427
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 400986166
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2281940872 144496548 2455451862 1082696406
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2702905851 1992889713 731289041 608504198
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2742293143 4197915274 606840 3671124731
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 149434841 2288560511 2994968424 2881838300
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 2226824643 327135318 3718671210 2121176659
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3254575292 1119957081 672831271
conv2d fprop_1x4x4x14_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3115523958 3622905002 4020453928 3853387318
conv2d fprop_1x23x56x98_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1702870033 1876930844 1190400523 3937287850
conv2d fprop_1x4x4x28_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 2587856937 2021107274 2789519899
conv2d fprop_1x23x56x100_10x22_128x3x3_pad_h4w5_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2368669977 1353376771 744357395 786349633
conv2d fprop_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 991402150 1393431534 2496492611 3901723984
conv2d fprop_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4208297221 4283492776 3148637036 258220505
conv2d fprop_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4178596783 3828059710 281106520 1103939403
conv2d fprop_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 924522595 563724475 1938163814 2197809394
conv2d fprop_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1021044158 1686067905 350851834 3999808950
conv2d fprop_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 2674994719 1034822169 1611033520
conv2d fprop_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 4201252830 1597212204 2181492560
|
||||
conv2d fprop_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3335547 70289262 3001492060 1379239000
|
||||
conv2d fprop_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 1288095320 4211138051 2804617605
|
||||
conv2d fprop_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1317457392 2202157489 1043108884 2923122465
|
||||
conv2d fprop_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2476454437 1857118302 3877008798 1206012078
|
||||
conv2d fprop_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2767650699 3514840131 2946529611 3907056932
|
||||
conv2d fprop_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3896287283 3112762669 1581171257 3959460786
|
||||
conv2d fprop_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1903067870 1021832870 1926804094 1756790353
|
||||
conv2d fprop_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3489785028 2466126497 1712378956 434322965
|
||||
conv2d fprop_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2051350923 263676708 355203300 821870356
|
||||
conv2d fprop_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 719099834 1474713672 2886387159 4086314983
|
||||
conv2d fprop_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3441724486 3162593831 1422796372 2049419539
|
||||
conv2d fprop_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2034354027 1249407570 1196036582 2684312264
|
||||
conv2d fprop_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 1060050551
|
||||
conv2d fprop_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 941893937 3608468045 2198911423 3361618746
|
||||
conv2d fprop_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 172579142 319546523 2332616929 543467298
|
||||
conv2d fprop_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2823351660 1326352711 3839068434 65031397
|
||||
conv2d fprop_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3238446487 2572503545 3604065639 2111204111
|
||||
conv2d fprop_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 2149247508 1775375365 2663631601 1249487679
|
||||
conv2d fprop_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 403997062 1679063623 4062928786
|
||||
conv2d dgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 1623218578 436154205
|
||||
conv2d dgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1479940693 3253144559 3883419107
|
||||
conv2d dgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 1871463331 2425320272 74566211
|
||||
conv2d dgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 664160900 3610888033 22347127
|
||||
conv2d dgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1924855848 1382111427 2541177413
|
||||
conv2d dgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 1764715518 3070473696 2392864704
|
||||
conv2d dgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 666906244 3401957738 2050602745
|
||||
conv2d dgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1575210381 781892324 2848949054
|
||||
conv2d dgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 2316839359 1539389419 4293781748
|
||||
conv2d dgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 2693098375
|
||||
conv2d dgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 2469024119 2885305868 1969608051
|
||||
conv2d dgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 554790212 2885143346 780489333
|
||||
conv2d dgrad_1x56x56x8_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 835105643 3337423971 3866137775
|
||||
conv2d dgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1092015789 3160693693 1526395881
|
||||
conv2d dgrad_1x56x56x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 2236679600 3168985259
|
||||
conv2d dgrad_1x55x55x12_28x28_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3184127693 1941683430 3784328837 471971363
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 289918791 1266976707 942688231 3457364823
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 1027662440 2005082293 2235558527
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 3380032042 1370040310 1348846927
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 671982235 1423304149 2107662762 1234913781
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 798317794 1709026638 2421185623 3308071321
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1721270411 2519327328 2541413264 3185574975
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2070174510 1364436192 3531942595
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2128738105 2056902987 3079166829 2329433528
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3227877956 645422556
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 672413387 3857917762 3817218800 985231315
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2754803027 1398036015 3630062764 2492522537
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2784049299 643733019 3649549642 2637869234
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2756413475 2332160299 302086821 3303132343
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1931093565 2458714707 2919710256 2311575036
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2472246681 2260022344 500095455 2760458995
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1530672622 3635363851 2402907878 4131497953
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 1500864134 2536338700 2459524764 2504484273
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 3344871528 2667385029 2714805835 3487838445
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 966721255 1547169349 3198573835 302049294
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 1317923157
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2643693957 2440004820 1576818970 3186679687
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 4028893260 4220759192 2236533218 3731336532
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 2956871200 1591352238 1756650151 1262787222
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_hnhwc_h_h 365467186 892422645 1334708242 1372556938
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_hnhwc_h_h 3347784734 150035460 2897171548 3701081496
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 4106152802 2634710231 744755886
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 3464637181 2709881923 2407415563
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 3723472741 3733128758 3129111191
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 2042513140 253288229 404121198
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1116254439 525487530 3284739065
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1743485155 91136873 2508716910
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 386662952 1127709182 4026285141
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 3954249564 2591894666 2655687700
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1263618595 1313664339
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 1300426008 1756414462 2995557277
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 447261065 121940906 1497499264
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3484040069 2966693627 1423016429 341928547
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1759979610 2761559427 68093525
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 2980501720 1650970502 3258883197
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 3502822733 3985958544 2568949300
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 868180534 3289288595 385631111 328914986
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3437976747 3391080565 1513955316 1521294163
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4195072693 1669352457 2608107448 4284090805
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3457330201 1126870455 106232038 3054809396
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 4239438967
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 754609939 1723074453 1186911503 2113601884
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1690216859 2413490039 36034283 1112346965
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 1601750164 14375779 2894970748
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 1300976652 4259930640 305685205
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 1747587481 4137156526 1174257270
|
||||
conv2d wgrad_1x4x4x12_1x1_8x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2956180805 1086820986 1644914756 2013471312
|
||||
conv2d wgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2056905385 447674669 724481645 1457430910
|
||||
conv2d wgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 1227883689 3401425854 3897766524
|
||||
conv2d wgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 3749787834 3350064812 1136116240
|
||||
conv2d wgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3414629540 820341033 770836461 2451581199
|
||||
conv2d wgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4100326666 2581696511 1088458082 1521190911
|
||||
conv2d wgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3662895757 2885454895 935600441 2615245898
|
||||
conv2d wgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 3831334389 3506139121 814982501
|
||||
conv2d wgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2154102133 737968461 1291834254 2665225480
|
||||
conv2d wgrad_1x11x7x64_6x4_8x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 1809195644 1765637461
|
||||
conv2d wgrad_1x11x7x64_6x4_8x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 4120764770 3573498719 3379808294 483095299
|
||||
conv2d wgrad_1x13x11x64_8x7_8x1x1_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1972540475 4194153035 2863868771 1639389008
|
||||
conv2d wgrad_1x17x19x64_9x10_16x2x2_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2624318208 157618421 1779474147 814087242
|
||||
conv2d wgrad_1x23x5x64_12x3_16x3x3_pad_h1w1_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2044596379 2300180628 423968553 3890279569
|
||||
conv2d wgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2609259399 1848932917 522753581 1926508271
|
||||
conv2d wgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2948772873 3663040534 4014266327 1288646188
|
||||
conv2d wgrad_1x55x51x256_28x26_512x1x1_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3271403719 1585195072 1487505772 3253374264
|
||||
conv2d wgrad_1x27x23x256_9x7_512x3x3_pad_h0w0_stride_h3w3_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1419588777 451194147 3578359696 3659768981
|
||||
conv2d wgrad_1x27x31x256_12x11_512x3x3_pad_h5w7_stride_h3w4_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 763924990 2780826684 2883769406 148530958
|
||||
conv2d wgrad_1x27x35x256_15x9_512x7x5_pad_h11w7_stride_h3w5_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2578426561 3849874822 102765469 1305171059
|
||||
conv2d wgrad_1x27x27x256_27x14_512x3x3_pad_h1w1_stride_h1w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1516344656
|
||||
conv2d wgrad_1x27x27x256_14x27_512x3x3_pad_h1w1_stride_h2w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 740110603 1995451256 2632815435 1586331550
|
||||
conv2d wgrad_3x28x28x256_14x14_256x2x2_pad_h0w0_stride_h2w2_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 2462511240 2274021368 1188866747 3178890497
|
||||
conv2d wgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 752289976 1226457131 4187777346 1400559240
|
||||
conv2d wgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 3723912751 1585959358 3731079159 1498901684
|
||||
conv2d wgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 fnhwc_fnhwc_fnhwc_f_f 2027599472 2758666204 3287095476 4291916486
|
||||
conv2d wgrad_1x8x8x1_8x8_1x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 fnhwc_fnhwc_fnhwc_f_f 1767700736 4278264698 2331753571 2554564568
|
||||
conv2d dgrad_1x1x1x64_3x3_8x1x1_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 927718585 3117803557 1370701307 1462167731
|
||||
conv2d dgrad_1x1x8x64_3x8_8x1x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4110991321 973422497 1926250028 3440543762
|
||||
conv2d dgrad_1x7x8x64_7x8_8x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 832653836 2892862516 3649300762 1521470286
|
||||
conv2d dgrad_1x7x9x64_6x8_8x4x4_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 2075083065 3181416651 1733426984 872275640
|
||||
conv2d dgrad_2x7x9x64_5x7_8x5x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 4005590448 1639170045 388151578 4186957447
|
||||
conv2d dgrad_3x7x9x64_4x7_8x6x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 181075276 1433744686 860506550 3475157408
|
||||
conv2d dgrad_3x7x9x64_4x6_8x6x6_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1513864544 1747719409 877465841 2345541783
|
||||
conv2d dgrad_3x7x9x64_3x5_8x7x7_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 856324887 2307248012 337386755 3363072703
|
||||
conv2d dgrad_1x23x21x128_23x21_224x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 1906605830 722034901 2562804622 2508759317
|
||||
conv2d dgrad_1x16x24x128_16x24_96x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 805717279 2196645331 3235235362 1518334120
|
||||
conv2d dgrad_4x4x5x128_3x3_256x3x6_pad_h0w0_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 3168796339 72559978 778918419 1260968000
|
||||
conv2d dgrad_4x2x3x256_1x1_328x3x5_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha1_beta0 hnhwc_hnhwc_fnhwc_f_f 261954979 2634885882 451986822 3792829599
|
||||
conv2d dgrad_1x17x11x288_17x11_160x3x3_pad_h1w1_stride_h1w1_dil_h1w1_corr_alpha2_beta2 hnhwc_hnhwc_fnhwc_f_f 3747142491 2426759809 2622222681 371723930
|
||||
@ -1,233 +0,0 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
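    # Each test below builds a Conv2dOperation from a MathInstruction (the
    # 16x8x16 f16 Tensor Core MMA), per-tensor TensorDescriptions (element
    # type, NHWC layout, vector-access alignment), and a TileDescription
    # (threadblock tile, pipeline stage count, warp layout), then verifies it
    # against a reference implementation via the test_all_conv2d harness.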
    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

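    # The _align4 variants below drop the A/B/C alignment from 8 to 4
    # half-precision elements, so they run on an explicit problem size whose
    # channel count (12) is divisible by 4 but not by 8.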
    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

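# pycutlass serves host/device workspace from a preallocated memory pool; the
# two arguments to get_memory_pool below appear to be the initial and maximum
# pool sizes in bytes (2**26 = 64 MiB each).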
if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,209 +0,0 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF16nhwcF16nhwcF32nhwcTensorOpF32SM80(unittest.TestCase):
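    # Dgrad with the optimized iterator is instantiated here with unit stride
    # (StrideSupport.Unity); the four tests vary only the pipeline depth
    # (stages=3 vs. 4) and the threadblock K extent (32 vs. 64), while C
    # accumulates and stores in f32 at alignment 4.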
    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage3_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_unity_stride_stage4_64(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=4)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=4,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,130 +0,0 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
import pycutlass
from pycutlass.conv2d_operation import *
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmF32nhwcF32nhwcF32nhwcSimtF32SM80(unittest.TestCase):
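    # OpClass.Simt targets the CUDA cores rather than Tensor Cores: the
    # "instruction" is a scalar FMA ([1, 1, 1]), the threadblock K extent
    # shrinks to 8, and C alignment drops to 1 since the SIMT epilogue issues
    # scalar stores.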
    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[4, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_f32nhwc_f32nhwc_f32nhwc_simt_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[1, 1, 1],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.Simt,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=1)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 8], stages=4,
            warp_count=[2, 4, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,127 +0,0 @@
# test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dDgradImplicitGemmTF32nhwcTF32nhwcTF32nhwcTensorOpF32SM80(unittest.TestCase):
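    # The tf32 kernels are declared with cutlass.float32 operands; the 16x8x8
    # Tensor Core MMA operates on tf32, so the f32 inputs are presumably
    # rounded to tf32 in the mainloop while accumulation stays in f32.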
    def test_SM80_Device_Conv2d_Dgrad_Analytic_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Dgrad_Optimized_ImplicitGemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float32, element_b=cutlass.float32,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float32,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 16], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.dgrad, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Unity,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,195 +0,0 @@
# test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


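# Helper that builds Conv2dProblemSize instances whose input and filter
# channel count equals the given small value, sweeping several activation
# sizes, filter extents, and strides.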
def conv2d_few_channel_problemsizes(channels):
    problem_sizes = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 8, 8, channels),
            cutlass.Tensor4DCoord(16, 3, 3, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 16, 16, channels),
            cutlass.Tensor4DCoord(16, 3, 3, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 16, 16, channels),
            cutlass.Tensor4DCoord(16, 7, 7, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(32, 7, 7, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 7, 7, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 5, 5, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 5, 5, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
    ]

    return problem_sizes




@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFewChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=2)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=2)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(2)))
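
    # With the few_channels iterator, the A/B access alignment divides the
    # channel count: 2 for the channels_2 test above, 1 for the channels_1
    # test below (which also switches to the smaller 16x8x8 instruction and
    # a 2-stage, K=32 threadblock tile).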
    def test_SM80_Device_Conv2d_Fprop_Few_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_1(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 8],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=1)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=1)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 32], stages=2,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.few_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_few_channel_problemsizes(1)))


if __name__ == '__main__':
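    # Assumption based on pycutlass's memory-pool API: the two arguments are
    # the initial and maximum pool sizes in bytes, so this reserves a fixed
    # 64 MiB (2**26) device pool for the tests.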
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,219 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# (BSD-3-Clause license text identical to the header above)
#
#################################################################################################

# test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


def conv2d_fixed_channel_problemsizes(channels):
    problem_sizes = [
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 8, 8, channels),
            cutlass.Tensor4DCoord(16, 3, 3, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(32, 7, 7, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 7, 7, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 5, 5, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
        cutlass.conv.Conv2dProblemSize(
            cutlass.Tensor4DCoord(1, 224, 224, channels),
            cutlass.Tensor4DCoord(64, 5, 5, channels),
            cutlass.Tensor4DCoord(1, 1, 1, 1),
            cutlass.MatrixCoord(2, 2),
            cutlass.MatrixCoord(1, 1),
            cutlass.conv.Mode.cross_correlation,
            1, 1
        ),
    ]

    return problem_sizes
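

# The fixed_channels iterator specializes for inputs whose channel count
# matches the operand access alignment, so each test below pairs an alignment
# of 8, 4, or 2 with the same channel count.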
@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropFixedChannelsF16NHWCF16NHWCF16NHWCTensorOpF32SM80(unittest.TestCase):
    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_8(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(8)))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(4)))

    def test_SM80_Device_Conv2d_Fprop_Fixed_Channels_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_channels_2(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float32, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=2)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=2)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float32)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.fixed_channels,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation, conv2d_fixed_channel_problemsizes(2)))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()
@ -1,341 +0,0 @@
#################################################################################################
#
# Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# (BSD-3-Clause license text identical to the header above)
#
#################################################################################################

# test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
import pycutlass
from pycutlass import *
from pycutlass.test import *
from pycutlass.utils.device import device_cc
import unittest


@unittest.skipIf(device_cc() < 80, "Device compute capability is insufficient for SM80 tests.")
class Conv2dFpropImplicitGemmF16nhwcF16nhwcF16nhwcTensorOpF16SM80(unittest.TestCase):
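    # The tests below exercise both iterator algorithms: "analytic" computes
    # tensor offsets on the fly each main-loop iteration, while "optimized"
    # precomputes offset deltas up front and is generally faster but imposes
    # stricter layout and alignment preconditions on the operands.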
    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=8)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=8)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        self.assertTrue(test_all_conv2d(operation))

    def test_SM80_Device_Conv2d_Fprop_Analytic_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=2)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=2)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.analytic,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
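
        # Channel counts of 12, 14, and 98 are divisible by 2 but not by 8,
        # so these sizes exercise the 2-element-aligned access path instead
        # of full 8-element f16 vector loads.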
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 14),
                cutlass.Tensor4DCoord(8, 3, 3, 14),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 23, 56, 98),
                cutlass.Tensor4DCoord(128, 3, 3, 98),
                cutlass.Tensor4DCoord(4, 0, 5, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align2(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=2)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=2)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )

        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 14),
                cutlass.Tensor4DCoord(8, 3, 3, 14),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 23, 56, 98),
                cutlass.Tensor4DCoord(128, 3, 3, 98),
                cutlass.Tensor4DCoord(4, 0, 5, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))

    def test_SM80_Device_Conv2d_Fprop_Optimized_ImplicitGemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_align4(self):
        math_inst = MathInstruction(
            instruction_shape=[16, 8, 16],
            element_a=cutlass.float16, element_b=cutlass.float16,
            element_accumulator=cutlass.float16, opcode_class=cutlass.OpClass.TensorOp,
            math_operation=MathOperation.multiply_add
        )

        A = TensorDescription(
            element=math_inst.element_a,
            layout=cutlass.TensorNHWC,
            alignment=4)
        B = TensorDescription(
            element=math_inst.element_b,
            layout=cutlass.TensorNHWC,
            alignment=4)
        C = TensorDescription(
            element=cutlass.float16,
            layout=cutlass.TensorNHWC,
            alignment=8)

        tile_description = TileDescription(
            threadblock_shape=[128, 128, 64], stages=3,
            warp_count=[2, 2, 1],
            math_instruction=math_inst
        )

        epilogue_functor = LinearCombination(
            C.element, C.alignment,
            math_inst.element_accumulator, cutlass.float16)

        operation = Conv2dOperation(
            conv_kind=cutlass.conv.Operator.fprop, iterator_algorithm=cutlass.conv.IteratorAlgorithm.optimized,
            arch=80, tile_description=tile_description, A=A, B=B, C=C,
            stride_support=StrideSupport.Strided,
            epilogue_functor=epilogue_functor,
            swizzling_functor=cutlass.IdentitySwizzle1
        )
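
        # Channel counts of 12, 28, and 100 are divisible by 4 but not by 8,
        # exercising the 4-element-aligned access path.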
        problem_sizes = [
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 12),
                cutlass.Tensor4DCoord(8, 3, 3, 12),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 4, 4, 28),
                cutlass.Tensor4DCoord(8, 3, 3, 28),
                cutlass.Tensor4DCoord(0, 0, 0, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
            cutlass.conv.Conv2dProblemSize(
                cutlass.Tensor4DCoord(1, 23, 56, 100),
                cutlass.Tensor4DCoord(128, 3, 3, 100),
                cutlass.Tensor4DCoord(4, 0, 5, 0),
                cutlass.MatrixCoord(3, 3),
                cutlass.MatrixCoord(1, 1),
                cutlass.conv.Mode.cross_correlation,
                1, 1
            ),
        ]

        self.assertTrue(test_all_conv2d(operation, problem_sizes))


if __name__ == '__main__':
    pycutlass.get_memory_pool(2**26, 2**26)
    unittest.main()