v4.0 update. (#2371)
This commit is contained in:
@ -52,7 +52,6 @@ set(header_files_to_check
|
||||
cute/swizzle_layout.hpp
|
||||
cute/tensor.hpp
|
||||
cute/tensor_impl.hpp
|
||||
cute/tensor_predicate.hpp
|
||||
cute/underscore.hpp
|
||||
# cute/algorithm
|
||||
cute/algorithm/axpby.hpp
|
||||
|
||||
@ -30,6 +30,8 @@ add_custom_target(
|
||||
cutlass_test_unit_conv_dgrad_device
|
||||
DEPENDS
|
||||
cutlass_test_unit_conv_dgrad_device_tensorop_sm90
|
||||
cutlass_test_unit_conv_dgrad_device_tensorop_sm100
|
||||
cutlass_test_unit_conv_dgrad_device_tensorop_sm100_fusion
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
@ -47,3 +49,43 @@ cutlass_test_unit_add_executable(
|
||||
sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
|
||||
set(cutlass_test_unit_conv_dgrad_device_tensorop_sm100_kernels
|
||||
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f8_tensorop_f32.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f8_f8_bf16_tensorop_f32.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f32_tensorop_f32.cu
|
||||
|
||||
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f8_tensorop_f32.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f8_f8_bf16_tensorop_f32.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f32_tensorop_f32.cu
|
||||
|
||||
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
|
||||
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
# Add the executable
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_conv_dgrad_device_tensorop_sm100
|
||||
${cutlass_test_unit_conv_dgrad_device_tensorop_sm100_kernels}
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_conv_dgrad_device_tensorop_sm100_fusion
|
||||
|
||||
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32_with_fusion.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32_with_fusion.cu
|
||||
|
||||
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
)
|
||||
|
||||
endif()
|
||||
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,190 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad conv1d, f16 operands with f32 output/accumulator, 64x64x64 MMA tile on a
// static 1x1x1 cluster.  Builds a ConvUniversal kernel through the collective
// builders and validates it against the shared conv testbed reference.
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad conv1d, f16 operands with f32 output/accumulator, 128x64x64 MMA tile on
// a static 1x1x1 cluster.
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad conv1d, f16 operands with f32 output/accumulator, 128x64x64 MMA tile on
// a static 1x2x1 cluster (128x128x64 cluster tile per the test name).
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad conv1d, f16 operands with f32 output/accumulator, 256x64x64 MMA tile on
// a static 2x1x1 cluster.
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad conv1d, f16 operands with f32 output/accumulator, 256x64x64 MMA tile on
// a static 2x2x1 cluster (256x128x64 cluster tile per the test name).
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad conv1d with a dynamic (runtime-chosen) cluster: preferred 2x4x1,
// fallback 2x2x1, 64x64x64 MMA tile.
//
// Fix: the mainloop builder previously requested Operator::kFprop, which
// contradicts this dgrad test suite and the test's own name (and the matching
// conv2d dynamic-cluster test, which uses kDgrad).  It now builds a kDgrad
// mainloop like every other test in this file.
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K); cluster M/N are runtime ints (dynamic cluster).
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,  // was kFprop: copy-paste error in a dgrad test
      // A and B operands: NWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, then preferred (2,4,1) and fallback (2,2,1) cluster dims
  // for the dynamic-cluster launch path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad conv2d, all-f16 data path (f16 accumulator, f32 epilogue compute),
// 64x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NHWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NHWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad conv2d, all-f16 data path (f16 accumulator, f32 epilogue compute),
// 128x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
  using ElementA      = cutlass::half_t;  // tensor A element
  using ElementB      = cutlass::half_t;  // tensor B element
  using ElementD      = cutlass::half_t;  // output element
  using ElementAccum  = cutlass::half_t;  // accumulator element
  using EpiCompute    = float;            // epilogue compute element
  using TileShapeMNK  = Shape<_128, _64, Shape<_64>>;
  using CtaCluster    = Shape<_1,_1,_1>;

  // Vector widths sized for 128-bit accesses on the epilogue's C/D tensors,
  // and the mainloop's fixed 8-element (128-bit for f16) operand alignment.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, CtaCluster,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, EpiCompute,
      ElementA, cutlass::layout::TensorNHWC, kAlignC,
      ElementD, cutlass::layout::TensorNHWC, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is deduced after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, cutlass::layout::TensorNHWC, kAlignAB,
      ElementB, cutlass::layout::TensorNHWC, kAlignAB,
      ElementAccum,
      TileShapeMNK, CtaCluster,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel =
      cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the shared conv testbed across its problem-size sweep.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad conv2d, all-f16 data path (f16 accumulator, f32 epilogue compute),
// 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile per
// the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NHWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NHWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad conv2d, all-f16 data path (f16 accumulator, f32 epilogue compute),
// 256x64x64 MMA tile on a static 2x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NHWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NHWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad conv2d, all-f16 data path (f16 accumulator, f32 epilogue compute),
// 256x64x64 MMA tile on a static 2x2x1 cluster (256x128x64 cluster tile per
// the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NHWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NHWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad conv2d with a dynamic (runtime-chosen) cluster: preferred 2x4x1,
// fallback 2x2x1, 64x64x64 MMA tile, all-f16 data path with f32 epilogue
// compute.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementA      = cutlass::half_t;  // tensor A element
  using ElementB      = cutlass::half_t;  // tensor B element
  using ElementD      = cutlass::half_t;  // output element
  using ElementAccum  = cutlass::half_t;  // accumulator element
  using EpiCompute    = float;            // epilogue compute element
  using TileShapeMNK  = Shape<_64, _64, Shape<_64>>;
  // Cluster M/N are runtime ints so the launch can pick the cluster shape.
  using CtaCluster    = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Vector widths sized for 128-bit accesses on the epilogue's C/D tensors,
  // and the mainloop's fixed 8-element (128-bit for f16) operand alignment.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, CtaCluster,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, EpiCompute,
      ElementA, cutlass::layout::TensorNHWC, kAlignC,
      ElementD, cutlass::layout::TensorNHWC, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is deduced after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, cutlass::layout::TensorNHWC, kAlignAB,
      ElementB, cutlass::layout::TensorNHWC, kAlignAB,
      ElementAccum,
      TileShapeMNK, CtaCluster,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel =
      cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, then preferred (2,4,1) and fallback (2,2,1) cluster dims
  // for the dynamic-cluster launch path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,190 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
// Epilogue-fusion coverage: dgrad conv2d with a non-trivial linear combination
// (alpha=2, beta=1), all-f16 data path, 64x64x64 MMA tile, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Element types: Act feeds tensor A, Flt feeds tensor B (dgrad operands).
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      // C and D tensors: NHWC layout, alignment chosen for 128-bit accesses.
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      // A and B operands: NHWC layout, 8-element (128-bit for f16) alignment.
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Deduce stage count after carving the epilogue's SMEM out of the budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed with alpha=2.0 and beta=1.0 to exercise the D = alpha*acc
  // + beta*C epilogue path (the default tests use alpha=1, beta=0).
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
// Dgrad, f16 in / f16 out, f16 accumulation: alpha != 1, beta != 0, plus
// a per-column bias fused into the epilogue.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;                     // epilogue compute type
  using ElementBias        = cutlass::half_t;
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  // 64x64x64 MMA tile on a trivial 1x1x1 cluster.
  using TileShape_MNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Fused epilogue: D = alpha * acc + beta * C + per-column bias.
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementD, ElementEpilogue, ElementBias>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha = 2, beta = 1 exercises the non-trivial linear-combination path.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
// Dgrad, f16 in / f16 out, f16 accumulation: alpha != 1, beta != 0, plus
// a per-column bias and a ReLU activation fused into the epilogue.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;                     // epilogue compute type
  using ElementBias        = cutlass::half_t;
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  // 64x64x64 MMA tile on a trivial 1x1x1 cluster.
  using TileShape_MNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Fused epilogue: D = ReLU(alpha * acc + beta * C + per-column bias).
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementD, ElementEpilogue, ElementBias>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha = 2, beta = 1 exercises the non-trivial linear-combination path.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation.
// Cluster tile 64x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation.
// Cluster tile 128x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation.
// Cluster tile 128x128x64 = 128x64 MMA tile on a static 1x2x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation.
// Cluster tile 256x64x64 = 256x64 MMA tile on a static 2x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation.
// Cluster tile 256x128x64 = 256x64 MMA tile on a static 2x2x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad, f16 in / f32 out, f32 accumulation, dynamic cluster shape:
// M/N cluster extents are runtime ints (preferred 2x4x1, fallback 2x2x1).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::half_t;
  using ElementB = cutlass::half_t;
  using ElementD = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK = Shape<_64, _64, Shape<_64>>;
  // Dynamic (int) M/N cluster extents; K extent stays a static 1.
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit aligned accesses for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, 8,
      ElementB, LayoutNHWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad, e4m3 (f8) in / bf16 out, f32 accumulation.
// Cluster tile 64x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementD = cutlass::bfloat16_t;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 16-byte (128-bit) aligned accesses for all operands.
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad, e4m3 (f8) in / bf16 out, f32 accumulation.
// Cluster tile 128x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementD = cutlass::bfloat16_t;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 16-byte (128-bit) aligned accesses for all operands.
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad, e4m3 (f8) in / bf16 out, f32 accumulation.
// Cluster tile 128x128x64 = 128x64 MMA tile on a static 1x2x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementD = cutlass::bfloat16_t;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // 16-byte (128-bit) aligned accesses for all operands.
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad, e4m3 (f8) in / bf16 out, f32 accumulation.
// Cluster tile 256x64x64 = 256x64 MMA tile on a static 2x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // A = output gradient, B = filter, D = computed input gradient.
  using ElementA = cutlass::float_e4m3_t;
  using ElementB = cutlass::float_e4m3_t;
  using ElementD = cutlass::bfloat16_t;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;                     // epilogue compute type
  using LayoutNHWC = cutlass::layout::TensorNHWC;

  using TileShape_MNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // 16-byte (128-bit) aligned accesses for all operands.
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Build the epilogue first: its shared-memory footprint is carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  static constexpr int EpiSmemBytes =
      static_cast<int>(sizeof(typename Epilogue::SharedStorage));

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<EpiSmemBytes>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem rank (2-D conv) is deduced from the mainloop dispatch policy.
  using ProblemShapeType = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp,
      Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using Kernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShapeType, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::bfloat16_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::bfloat16_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,237 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// per-channel alpha/beta scaling && bias && relu
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f16 NHWC output, f32 accumulation.
// 64x64x64 MMA tile, static 1x1x1 cluster, with fused per-channel alpha/beta
// scaling, per-channel bias, and ReLU in the epilogue.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::half_t;         // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Per-column (per-channel) linear combination + per-column bias + ReLU.
  using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Use non-default alpha/beta (2.0, 1.0) so the scaling and beta (source
  // accumulation) paths named by this test are actually exercised, matching
  // the f32-output variant of this same fusion test.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 64x64x64 MMA tile, static 1x1x1 cluster, default linear-combination epilogue.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 128x64x64 MMA tile, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 256x64x64 MMA tile across a static 2x1x1 cluster (256x64x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 256x64x64 MMA tile across a static 2x2x1 cluster (256x128x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> f32 NHWC output, f32 accumulation.
// 64x64x64 MMA tile with a *dynamic* cluster: M/N cluster dims are runtime
// values; preferred cluster 2x4x1 with fallback 2x2x1.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = float;                   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // First two cluster dimensions are dynamic (int); z dimension fixed to 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; runtime preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> e4m3 NHWC output, f32 accumulation.
// 64x64x64 MMA tile, static 1x1x1 cluster, default linear-combination epilogue.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::float_e4m3_t;   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> e4m3 NHWC output, f32 accumulation.
// 128x64x64 MMA tile, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::float_e4m3_t;   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> e4m3 NHWC output, f32 accumulation.
// 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::float_e4m3_t;   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> e4m3 NHWC output, f32 accumulation.
// 256x64x64 MMA tile across a static 2x1x1 cluster (256x64x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::float_e4m3_t;   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Dgrad: e4m3 NHWC activations/filters -> e4m3 NHWC output, f32 accumulation.
// 256x64x64 MMA tile across a static 2x2x1 cluster (256x128x64 cluster tile).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // output gradient (dy)
  using ElementFlt = cutlass::float_e4m3_t;   // filter (w)
  using ElementOut = cutlass::float_e4m3_t;   // input gradient (dx)
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits, expressed in element counts.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Mainloop: dgrad with 16-byte-aligned A/B operands; smem stage count is
  // derived after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed sweep.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// SM100 2-D dgrad implicit-GEMM convolution with a dynamic (runtime) cluster
// shape: FP8 (e4m3) activation/filter, FP8 output, FP32 accumulation, NHWC.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::float_e4m3_t;   // activation-gradient (A operand) element type
  using ElementFlt = cutlass::float_e4m3_t;   // filter (B operand) element type
  using ElementOut = cutlass::float_e4m3_t;   // output element type
  using ElementAcc = float;                   // tensor-op accumulator type
  using ElementCompute = float;               // epilogue compute type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  // First two cluster dims are runtime ints (dynamic cluster); z dim is a
  // static 1. The concrete shapes are supplied to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 16-byte (16 / sizeof(Element)) alignment; stage
  // count chosen automatically after carving out the epilogue's
  // shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Preferred cluster 2x4x1 with fallback 2x2x1 (per the test name);
  // the leading scalars are presumably alpha/beta — see testbed signature.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter/output,
// f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;              // single-CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter/output,
// f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;               // single-CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter/output,
// f16 accumulation, NDHWC layouts. The 128x64 MMA tile combined with the
// 1x2x1 cluster yields the 128x128x64 cluster tile in the test name.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_2,_1>;               // 1x2x1 CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter/output,
// f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_2,_1,_1>;               // 2x1x1 CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter/output,
// f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_2,_2,_1>;               // 2x2x1 CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// SM100 3-D dgrad implicit-GEMM convolution with a dynamic (runtime) cluster
// shape: f16 activation/filter/output, f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  // First two cluster dims are runtime ints (dynamic cluster); z dim is a
  // static 1. The concrete shapes are supplied to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Preferred cluster 2x4x1 with fallback 2x2x1 (per the test name);
  // the leading scalars are presumably alpha/beta — see testbed signature.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,143 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
// SM100 3-D dgrad implicit-GEMM convolution exercising a non-trivial linear
// combination in the epilogue: f16 in/out, f16 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;              // single-CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0 (per the test name) to exercise the C-source path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
// SM100 3-D dgrad implicit-GEMM convolution with an explicit epilogue fusion:
// linear combination plus per-column bias; f16 in/out, f16 accumulation.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = cutlass::half_t;    // output element type
  using ElementAcc = cutlass::half_t;    // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using ElementBias = cutlass::half_t;   // bias vector element type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;              // single-CTA cluster

  // Epilogue fusion: alpha * acc + beta * C + per-column bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOut, ElementCompute, ElementBias>;
  // Epilogue: builder-selected tile and schedule with the fusion above;
  // source/output alignment is 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0 (per the test name); the testbed detects the
  // bias fusion from the epilogue type.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter, f32
// output, f32 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = float;              // output element type (wider than inputs)
  using ElementAcc = float;              // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;              // single-CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// SM100 3-D dgrad implicit-GEMM convolution: f16 activation/filter, f32
// output, f32 accumulation, NDHWC layouts.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;    // activation-gradient (A operand) element type
  using ElementFlt = cutlass::half_t;    // filter (B operand) element type
  using ElementOut = float;              // output element type (wider than inputs)
  using ElementAcc = float;              // tensor-op accumulator type
  using ElementCompute = float;          // epilogue compute type
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;  // per-MMA tile (M, N, K)
  using ClusterShape = Shape<_1,_1,_1>;               // single-CTA cluster

  // Epilogue: builder-selected tile and schedule; source/output alignment is
  // 128 bits (128 / sizeof_bits elements).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: dgrad with 8-element operand alignment; stage count chosen
  // automatically after carving out the epilogue's shared-storage footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with default arguments.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with f16 NDHWC
  // inputs, f32 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // 128x64x64 MMA tile with a static 1x2x1 cluster (cluster tile 128x128x64).
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; both f16 operands use 8-element (128-bit) alignment.
  // Stage count is chosen automatically after carving the epilogue's
  // shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with f16 NDHWC
  // inputs, f32 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // 256x64x64 MMA tile with a static 2x1x1 cluster (cluster tile 256x64x64).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; both f16 operands use 8-element (128-bit) alignment.
  // Stage count is chosen automatically after carving the epilogue's
  // shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with f16 NDHWC
  // inputs, f32 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // 256x64x64 MMA tile with a static 2x2x1 cluster (cluster tile 256x128x64).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; both f16 operands use 8-element (128-bit) alignment.
  // Stage count is chosen automatically after carving the epilogue's
  // shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic-cluster test: SM100 tensor-op 3-D dgrad convolution with f16
  // NDHWC inputs, f32 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // 64x64x64 MMA tile; cluster x/y are runtime-dynamic (int), z is static 1.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; both f16 operands use 8-element (128-bit) alignment.
  // Stage count is chosen automatically after carving the epilogue's
  // shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 8,
      ElementFlt, cutlass::layout::TensorNDHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with preferred cluster 2x4x1 and fallback 2x2x1.
  // NOTE(review): the three leading scalars are presumably alpha, beta and a
  // tolerance/threshold — confirm against TestAllConv in testbed_conv.hpp.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, bf16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 64x64x64 MMA tile with a static 1x1x1 cluster (cluster tile 64x64x64).
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, bf16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 128x64x64 MMA tile with a static 1x1x1 cluster (cluster tile 128x64x64).
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, bf16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 128x64x64 MMA tile with a static 1x2x1 cluster (cluster tile 128x128x64).
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, bf16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 256x64x64 MMA tile with a static 2x1x1 cluster (cluster tile 256x64x64).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, bf16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 256x64x64 MMA tile with a static 2x2x1 cluster (cluster tile 256x128x64).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic-cluster test: SM100 tensor-op 3-D dgrad convolution with e4m3
  // (f8) NDHWC inputs, bf16 NDHWC output, and f32 accumulation/compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 64x64x64 MMA tile; cluster x/y are runtime-dynamic (int), z is static 1.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed sweep with preferred cluster 2x4x1 and fallback 2x2x1.
  // NOTE(review): the three leading scalars are presumably alpha, beta and a
  // tolerance/threshold — confirm against TestAllConv in testbed_conv.hpp.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Functional test: SM100 tensor-op 3-D dgrad convolution with e4m3 (f8)
  // NDHWC inputs, f16 NDHWC output, and f32 accumulation/epilogue compute.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = float;
  using ElementCompute = float;
  // 64x64x64 MMA tile with a static 1x1x1 cluster (cluster tile 64x64x64).
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue selected by the collective builder; the C (ElementAct) and
  // D (ElementOut) NDHWC tensors use 128-bit-aligned accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Dgrad mainloop; 16-byte alignment for both operands, expressed in
  // elements (16 for the 1-byte f8 types). Stage count is derived after
  // carving the epilogue's shared-memory footprint out of SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape, kernel, and device adapter are assembled from the two
  // collectives; conv op and spatial rank come from the dispatch policy.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape,
      CollectiveMainloop,
      CollectiveEpilogue
    >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Run the testbed's full sweep of conv problem sizes for this kernel.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Same configuration as the 64x64x64 case, with the MMA M-tile doubled to 128.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Epilogue first; mainloop stage count is carved out of remaining smem.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // 128x64 MMA tile replicated across a 1x2x1 cluster (cluster-level tile
  // 128x128 per the test name -- TODO confirm against builder semantics).
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // 256-wide MMA M-tile with a 2x1x1 cluster (2-CTA cooperation in M --
  // presumably a 2SM MMA configuration; confirm against the SM100 builder).
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Largest static configuration in this suite: 256x64 MMA tile over a
  // 2x2x1 cluster (cluster-level tile 256x128 per the test name).
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic (runtime) cluster shape: the first two cluster extents are runtime
  // ints (value 0 here acts as the "dynamic" marker), the third is static _1.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = decltype(make_shape(int(0), int(0), Int<1>{}));

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1.0, beta = 0.0; preferred cluster 2x4x1 with fallback 2x2x1.
  // NOTE(review): third (0.0f) argument's meaning is defined by the testbed --
  // confirm against TestAllConv's signature in ../testbed_conv.hpp.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,237 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||

// alpha != 1 && beta != 0
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta) {
  // Exercises the default linear-combination epilogue with non-trivial
  // scaling: alpha = 2, beta = 1 (so the source tensor C contributes).
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}

// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias) {
  // Adds a per-column f16 bias fusion (LinCombPerColBias) on top of the
  // alpha/beta linear combination.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using ElementBias    = cutlass::half_t;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0; bias tensor is populated by the testbed when the
  // fusion op declares one.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}

// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Per-column bias plus a ReLU element-wise activation after the
  // alpha/beta linear combination (LinCombPerColBiasEltAct).
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using ElementBias    = cutlass::half_t;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}

// per-channel alpha/beta scaling && bias && relu
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  // Per-column alpha/beta vectors plus per-column bias and ReLU
  // (PerColLinCombPerColBiasEltAct). Scalar alpha/beta are left at the
  // testbed defaults; the per-column scale vectors come from the fusion
  // arguments populated by the testbed.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = float;
  using ElementCompute = float;
  using ElementBias    = cutlass::half_t;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Same as the f16-output suite, but with a float (f32) output tensor.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f32-output variant with a 128-wide MMA M-tile.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f32-output variant: 128x64 MMA tile over a 1x2x1 cluster.
  using ElementAct     = cutlass::float_e4m3_t;
  using ElementFlt     = cutlass::float_e4m3_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_2,_1>;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::float_e4m3_t;
|
||||
using ElementFlt = cutlass::float_e4m3_t;
|
||||
using ElementOut = cutlass::float_e4m3_t;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kDgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -32,6 +32,8 @@ add_custom_target(
|
||||
cutlass_test_unit_conv1d_fprop_device_tensorop_sm90
|
||||
cutlass_test_unit_conv2d_fprop_device_tensorop_sm90
|
||||
cutlass_test_unit_conv3d_fprop_device_tensorop_sm90
|
||||
cutlass_test_unit_conv_fprop_device_tensorop_sm100
|
||||
cutlass_test_unit_conv_fprop_device_tensorop_sm100_fusion
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
@ -73,3 +75,50 @@ cutlass_test_unit_add_executable(
|
||||
sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_conv_fprop_device_tensorop_sm100
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_conv_fprop_device_tensorop_sm100_fusion
|
||||
|
||||
# No batching of source to control compiler memory usage
|
||||
BATCH_SOURCES ON
|
||||
BATCH_SIZE 1
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
|
||||
sm100_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
|
||||
sm100_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
|
||||
sm100_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
|
||||
)
|
||||
|
||||
endif()
|
||||
|
||||
@ -0,0 +1,246 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp16 out with fp16
// accumulation. 128x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
  // Element types: activation (A), filter (B), output (C/D), accumulator,
  // and the precision used for epilogue arithmetic.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA instruction tile (M, N, K) and static cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape (1-D spatial) deduced from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp16 out with fp16
// accumulation. 256x64x64 MMA tile on a static 2x2x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Element types: activation (A), filter (B), output (C/D), accumulator,
  // and the precision used for epilogue arithmetic.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA instruction tile (M, N, K) and static cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape (1-D spatial) deduced from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp16 out with fp16
// accumulation, 64x64x64 CTA tile, *dynamic* cluster shape: preferred
// 2x4x1 with 2x2x1 fallback, both passed to the testbed at runtime.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Runtime-int cluster M/N (dynamic cluster); only the Z extent is static.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, default tolerance; dim3 args are the preferred and
  // fallback cluster shapes for the dynamic-cluster launch.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,236 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
// Functional test: SM100 conv1d fprop, fp16 end-to-end with fp16
// accumulation, exercising a non-trivial linear combination (alpha != 1,
// beta != 0). 64x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Element types: A = activation, B = filter, C/D = source/output tensors.
  using TypeA     = cutlass::half_t;
  using TypeB     = cutlass::half_t;
  using TypeD     = cutlass::half_t;
  using TypeAccum = cutlass::half_t;
  using TypeScale = float;  // epilogue compute precision for the alpha/beta math

  using TileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterSize = Shape<_1,_1,_1>;

  // Epilogue is built first so the mainloop can carve its stage count
  // around the epilogue's shared-memory requirement.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape, ClusterSize,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAccum, TypeScale,
      TypeA, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<TypeA>::value,
      TypeD, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<TypeD>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      TypeA, cutlass::layout::TensorNWC, 8,
      TypeB, cutlass::layout::TensorNWC, 8,
      TypeAccum,
      TileShape, ClusterSize,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto>::CollectiveOp;

  // Assemble the device-level operator from the two collectives.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Conv    = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha = 2, beta = 1 forces both scaling and source accumulation paths.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
// Functional test: SM100 conv1d fprop, fp16 with fp16 accumulation, fused
// epilogue: alpha != 1, beta != 0, plus per-column bias.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // Bias vector element type, consumed by the fusion below.
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Linear combination with per-column bias fused into the epilogue.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    // Override the default fusion with the bias-enabled one above.
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 so both scaling and source paths are exercised.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
// Functional test: SM100 conv1d fprop, fp16 with fp16 accumulation, fused
// epilogue: alpha != 1, beta != 0, per-column bias, and ReLU activation.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // Bias vector element type, consumed by the fusion below.
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Linear combination + per-column bias + elementwise ReLU, fused.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    // Override the default fusion with the bias+activation one above.
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 so both scaling and source paths are exercised.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
// Functional test: SM100 conv1d fprop, fp16 with fp16 accumulation, fused
// epilogue: alpha != 1, beta != 0, per-column bias, and GELU activation
// (Taylor-approximated variant).
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // Bias vector element type, consumed by the fusion below.
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Linear combination + per-column bias + elementwise GELU (Taylor), fused.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    // Override the default fusion with the bias+activation one above.
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; third argument (0.005f) relaxes the comparison —
  // presumably to absorb GELU_taylor approximation error (verify against
  // the testbed's parameter meaning).
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,292 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp32 out with fp32
// accumulation. 64x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types: activation (A), filter (B), output (C/D), accumulator,
  // and the precision used for epilogue arithmetic.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA instruction tile (M, N, K) and static cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape (1-D spatial) deduced from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp32 out with fp32
// accumulation. 128x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp32 out with fp32
// accumulation. 128x64x64 MMA tile on a static 1x2x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp32 out with fp32
// accumulation. 256x64x64 MMA tile on a static 2x2x1 cluster.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 8-element alignment.
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Functional test: SM100 conv1d fprop, fp16 in / fp32 out with fp32
// accumulation, 64x64x64 CTA tile, *dynamic* cluster shape: preferred
// 2x4x1 with 2x2x1 fallback, both passed to the testbed at runtime.
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Runtime-int cluster M/N (dynamic cluster); only the Z extent is static.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    // Source (C) and output (D) tensors, NWC layout, 128-bit aligned access.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    // Activation and filter tensors, NWC layout, 128-bit aligned access.
    // NOTE: previously written as `16 / sizeof(Element...)` (bytes); use the
    // bit-based form like every sibling test — same value (8) for half_t,
    // and correct even for sub-byte element types.
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Reserve the epilogue's smem so mainloop stages fit in the remainder.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, default tolerance; dim3 args are the preferred and
  // fallback cluster shapes for the dynamic-cluster launch.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,339 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 128x64x64_1x1x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 128x128x64_1x2x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 256x64x64_2x1x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 256x128x64_2x2x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,378 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// per-channel alpha/beta scaling && bias && relu
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.05f));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu_erf
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::GELU, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && swish
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::SiLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 128x64x32_1x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x32
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 128x128x32_1x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x32
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 256x64x32_2x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x32
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 256x128x32_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x32
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,190 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
float, cutlass::layout::TensorNWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Functional test: f16 activations/filters, f16 output, f16 accumulation.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop's auto-selected stage count below. Alignments are 128 bits
  // per element type.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop; stage count auto-derived after reserving the
  // epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level kernel and run the shared conv testbed
  // across its built-in set of problem sizes.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Same configuration as the 64x64x64 test but with a 128-wide MMA M tile.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// 128x64 MMA tile replicated across a 1x2x1 cluster (cluster N tile = 128).
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// 256-wide MMA M tile paired with a 2x1x1 cluster.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Largest static-cluster config in this suite: 256x64 MMA tile on a
// 2x2x1 cluster (cluster N tile = 128).
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Dynamic-cluster variant: cluster M/N dims are runtime values, with the
// preferred/fallback shapes supplied to the testbed at run time.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // First two cluster dims are dynamic (runtime ints); K dim stays static.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Arguments appear to be (alpha, beta, tolerance, preferred cluster,
  // fallback cluster) — preferred 2x4x1 with a 2x2x1 fallback; confirm
  // against the testbed_conv.hpp signature.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,237 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
// Exercises the default linear-combination epilogue with non-trivial
// scaling: alpha = 2, beta = 1 (so source tensor C participates).
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Operand element types and epilogue compute precision.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Default epilogue (no fusion operation supplied).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop; stage count carved out around the epilogue storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
// Adds a per-column bias fusion on top of alpha/beta scaling.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // Operand element types, epilogue compute precision, and bias type.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue fusion: linear combination plus a per-column bias vector.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  // Fprop mainloop; stage count carved out around the epilogue storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
// Per-column bias followed by a ReLU elementwise activation.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Operand element types, epilogue compute precision, and bias type.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue fusion: linear combination + per-column bias + ReLU.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  // Fprop mainloop; stage count carved out around the epilogue storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
// Per-column bias followed by a GELU (Taylor approximation) activation.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // Operand element types, epilogue compute precision, and bias type.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  using ElementBias = cutlass::half_t;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue fusion: linear combination + per-column bias + GELU_taylor.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  // Fprop mainloop; stage count carved out around the epilogue storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0; the third argument (0.005f) relaxes the
  // check tolerance — presumably because GELU_taylor is an approximation
  // of the reference GELU; confirm against the testbed signature.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Variant with f32 output and f32 accumulation (f16 inputs).
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Operand element types: f16 activations/filters, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// f32-output variant with a 128-wide MMA M tile.
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Operand element types: f16 activations/filters, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// f32-output variant: 128x64 MMA tile on a 1x2x1 cluster (cluster N = 128).
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Operand element types: f16 activations/filters, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and the static CTA cluster shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // Epilogue built first so its shared-storage size can be carved out of
  // the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  // Fprop mainloop with builder-selected schedule and stage count.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Device-level kernel assembly and testbed run.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 8,
      ElementFlt, cutlass::layout::TensorNHWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints (0 placeholders); the
  // preferred/fallback shapes are supplied to the testbed at run time.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,339 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
      int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  // (The original had this ProblemShape alias duplicated on two consecutive
  // lines — copy-paste artifact; declared exactly once here.)
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 128x64x64_1x1x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 128x128x64_1x2x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 256x64x64_2x1x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 256x128x64_2x2x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints (0 placeholders); the
  // preferred/fallback shapes are supplied to the testbed at run time.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,378 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, and the epilogue compute type.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // The epilogue is built first so its shared-memory footprint can be carved
  // out of the mainloop stage count below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Exercise non-default linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, epilogue compute, and per-column bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Fused epilogue: linear combination plus a per-column bias term.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Exercise non-default linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Operand element types: activation (A), filter (B), output (C/D),
  // accumulator, epilogue compute, and per-column bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Per-MMA tile shape and the CTA cluster launch shape.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // Fused epilogue: linear combination + per-column bias + ReLU activation.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the kernel and wrap it in the host-side device adapter.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Exercise non-default linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// per-channel alpha/beta scaling && bias && relu
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu_erf
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::GELU, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && swish
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
|
||||
using ElementAct = int8_t;
|
||||
using ElementFlt = int8_t;
|
||||
using ElementOut = int32_t;
|
||||
using ElementAcc = int32_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::SiLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
|
||||
int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 128x64x32_1x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x32
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 128x128x32_1x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x32
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 256x64x32_2x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x32
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 256x128x32_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x32
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,190 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
float, cutlass::layout::TensorNHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
// ===================== begin new test file: SM100 conv3d fprop f16 NDHWC unit tests =====================
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
// ===================== begin new test file: SM100 conv3d fprop f16 NDHWC epilogue-fusion unit tests =====================
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = cutlass::half_t;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && HardSwish
|
||||
// alpha != 1, beta != 0, per-column bias, HardSwish activation (epilogue fusion).
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_hardswish) {
  // Operand/accumulator element types: all f16, with f32 epilogue compute.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = cutlass::half_t;  // output
  using ElementAcc     = cutlass::half_t;  // accumulator
  using ElementCompute = float;            // epilogue compute precision
  using ElementBias    = cutlass::half_t;  // per-column bias

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue source and output operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Linear combination + per-column bias + ScaledHardSwish fused into the epilogue.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ScaledHardSwish, ElementD, ElementCompute, ElementBias>;

  // Build the epilogue first: the mainloop carves its stage count around
  // the epilogue's shared-memory footprint (see StageCountAutoCarveout below).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem description derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the fused activation path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && leakyrelu
|
||||
// alpha != 1, beta != 0, per-column bias, LeakyReLU activation (epilogue fusion).
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_leakyrelu) {
  // Operand/accumulator element types: all f16, with f32 epilogue compute.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = cutlass::half_t;  // output
  using ElementAcc     = cutlass::half_t;  // accumulator
  using ElementCompute = float;            // epilogue compute precision
  using ElementBias    = cutlass::half_t;  // per-column bias

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue source and output operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Linear combination + per-column bias + LeakyReLU fused into the epilogue.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::LeakyReLU, ElementD, ElementCompute, ElementBias>;

  // Build the epilogue first: the mainloop carves its stage count around
  // the epilogue's shared-memory footprint (see StageCountAutoCarveout below).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem description derived from the mainloop's dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the fused activation path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Cluster tile 64x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Cluster tile 128x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Cluster tile 128x128x64, static 1x2x1 cluster (N dimension spans two CTAs).
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _2, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Cluster tile 256x64x64, static 2x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Cluster tile 256x128x64, static 2x2x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _2, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 8,
      ElementB, LayoutTag, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// CTA tile 64x64x64, dynamic cluster: preferred 2x4x1 with 2x2x1 fallback.
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 activation/filter, f32 output and accumulator.
  using ElementA       = cutlass::half_t;  // activation
  using ElementB       = cutlass::half_t;  // filter
  using ElementD       = float;            // output
  using ElementAcc     = float;            // accumulator
  using ElementCompute = float;            // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Runtime (dynamic) M/N cluster extents; K extent fixed at 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1, beta = 0; preferred cluster (2,4,1), fallback cluster (2,2,1).
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Cluster tile 64x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1) {
  // s8 activation/filter, s32 output/accumulator, f32 epilogue compute.
  using ElementA       = int8_t;   // activation
  using ElementB       = int8_t;   // filter
  using ElementD       = int32_t;  // output
  using ElementAcc     = int32_t;  // accumulator
  using ElementCompute = float;    // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Cluster tile 128x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 128x64x64_1x1x1) {
  // s8 activation/filter, s32 output/accumulator, f32 epilogue compute.
  using ElementA       = int8_t;   // activation
  using ElementB       = int8_t;   // filter
  using ElementD       = int32_t;  // output
  using ElementAcc     = int32_t;  // accumulator
  using ElementCompute = float;    // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
// Cluster tile 128x128x64, static 1x2x1 cluster (N dimension spans two CTAs).
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 128x128x64_1x2x1) {
  // s8 activation/filter, s32 output/accumulator, f32 epilogue compute.
  using ElementA       = int8_t;   // activation
  using ElementB       = int8_t;   // filter
  using ElementD       = int32_t;  // output
  using ElementAcc     = int32_t;  // accumulator
  using ElementCompute = float;    // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _2, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Cluster tile 256x64x64, static 2x1x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 256x64x64_2x1x1) {
  // s8 activation/filter, s32 output/accumulator, f32 epilogue compute.
  using ElementA       = int8_t;   // activation
  using ElementB       = int8_t;   // filter
  using ElementD       = int32_t;  // output
  using ElementAcc     = int32_t;  // accumulator
  using ElementCompute = float;    // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _1, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Cluster tile 256x128x64, static 2x2x1 cluster.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 256x128x64_2x2x1) {
  // s8 activation/filter, s32 output/accumulator, f32 epilogue compute.
  using ElementA       = int8_t;   // activation
  using ElementB       = int8_t;   // filter
  using ElementD       = int32_t;  // output
  using ElementAcc     = int32_t;  // accumulator
  using ElementCompute = float;    // epilogue compute

  using LayoutTag    = cutlass::layout::TensorNDHWC;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _2, _1>;

  // 128-bit vectorized accesses for the epilogue operands.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  // Epilogue is built first so the mainloop stage count can be carved
  // around its shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementA, LayoutTag, AlignC,
      ElementD, LayoutTag, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop operand alignment: a full 16-byte access per operand.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, 16 / sizeof(ElementA),
      ElementB, LayoutTag, 16 / sizeof(ElementB),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // 3-D conv problem shape derived from the mainloop dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
//
// Dynamic (runtime-valued) cluster: preferred 2x4x1 with 2x2x1 fallback,
// CTA tile 64x64x64, s8 x s8 -> s32 fprop conv3d.
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Element types for A (activation), B (filter), C/D (output) and accumulation.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // The MMA tile is static; the cluster shape is dynamic (int) in M and N and
  // statically 1 in L, so the actual cluster dimensions are picked at launch.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue: default linear combination, 128-bit aligned C/D accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: stage count derived automatically after reserving the
  // epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,473 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
// alpha != 1 && beta != 0
// s8 x s8 -> s32 fprop conv3d, CTA tile 64x64x64, cluster 1x1x1.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Epilogue: default linear combination. Use the declared element aliases
  // instead of bare int8_t/int32_t literals (ElementOut was unused before).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop: stage count derived automatically after carving out the
  // epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Exercise non-trivial scaling: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
// alpha != 1 && beta != 0 && bias
// Adds a per-column (per-output-channel) bias fusion to the epilogue.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = alpha * acc + beta * C + per-column bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals
  // so the epilogue types stay in sync with the declarations above.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
// alpha != 1 && beta != 0 && bias && relu
// Per-column bias followed by a ReLU elementwise activation in the epilogue.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = ReLU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
// per-channel alpha/beta scaling && bias && relu
|
||||
// per-channel alpha/beta scaling && bias && relu
// Uses per-column (per-output-channel) alpha/beta vectors plus bias and ReLU.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = ReLU(alpha[col] * acc + beta[col] * C + bias[col]).
  using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Default testbed arguments; the per-channel scale vectors are supplied by
  // the fusion's testbed support.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu
|
||||
// alpha != 1 && beta != 0 && bias && gelu
// Per-column bias followed by the Taylor-approximation GELU activation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = GELU_taylor(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; loosened tolerance (0.005) for the approximate GELU.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && gelu_erf
|
||||
// alpha != 1 && beta != 0 && bias && gelu_erf
// Per-column bias followed by the erf-based GELU activation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = GELU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; loosened tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && swish
|
||||
// alpha != 1 && beta != 0 && bias && swish
// Per-column bias followed by the SiLU (swish) activation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = SiLU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::SiLu, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; loosened tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && leakyrelu
|
||||
// alpha != 1 && beta != 0 && bias && leakyrelu
// Per-column bias followed by the LeakyReLU activation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_leakyrelu) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = LeakyReLU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::LeakyReLU, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; loosened tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && hardswish
|
||||
// alpha != 1 && beta != 0 && bias && hardswish
// Per-column bias followed by the scaled hard-swish activation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_hardswish) {
  using ElementAct     = int8_t;    // activation (tensor A)
  using ElementFlt     = int8_t;    // filter (tensor B)
  using ElementOut     = int32_t;   // output (tensors C/D)
  using ElementAcc     = int32_t;   // accumulator
  using ElementCompute = float;     // epilogue compute type
  using ElementBias    = float;     // bias vector element type
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Fusion: D = ScaledHardSwish(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ScaledHardSwish, ElementOut, ElementCompute, ElementBias>;

  // Use the declared element aliases instead of bare int8_t/int32_t literals.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1; loosened tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
//
// Cluster tile shape 64x64x32, cluster shape 1x1x1.
// tf32 (fp32 in) fprop conv3d, fp32 accumulate, NDHWC layouts.
//
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1) {
  using ElementAct     = float;   // activation (tensor A)
  using ElementFlt     = float;   // filter (tensor B)
  using ElementOut     = float;   // output (tensors C/D)
  using ElementAcc     = float;   // accumulator
  using ElementCompute = float;   // epilogue compute type
  using MmaTileShape   = Shape<_64, _64, Shape<_32>>;
  using ClusterShape   = Shape<_1,_1,_1>;

  // Epilogue: default linear combination. Derive alignments from the declared
  // element aliases (128b == 4 floats) rather than hard-coding `float, ..., 4`,
  // matching the convention of the sibling tests; ElementOut was unused before.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop alignments likewise derived (16B == 4 floats).
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Default alpha = 1, beta = 0.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x32
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 128x64x32_1x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x32
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 128x128x32_1x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x32
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 256x64x32_2x1x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x32
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 256x128x32_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x32
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,190 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
|
||||
ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
// alpha != 1 && beta != 0 && bias && relu
|
||||
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
|
||||
using ElementAct = float;
|
||||
using ElementFlt = float;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using ElementBias = float;
|
||||
using MmaTileShape = Shape<_64, _64, Shape<_32>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
|
||||
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
float, cutlass::layout::TensorNDHWC, 4,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto,
|
||||
FusionOperation
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kFprop,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 4,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -172,6 +172,8 @@ struct ConvTestbed {
|
||||
|
||||
static constexpr bool IsBiasEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::value &&
|
||||
!cute::is_same_v<BiasType, void>;
|
||||
static constexpr bool IsPerChannelScaleEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithPerChannelScaled<FusionOp>::value;
|
||||
|
||||
static constexpr bool DisableSource = cute::is_void_v<typename FusionOp::ElementSource>;
|
||||
|
||||
using StrideC = typename Conv::ConvKernel::StrideC;
|
||||
@ -213,10 +215,24 @@ struct ConvTestbed {
|
||||
tensor_D_computed.resize(sizeof(ElementD) * problem_shape.size_C());
|
||||
tensor_D_reference.resize(sizeof(ElementD) * problem_shape.size_C());
|
||||
tensor_bias.resize(sizeof(ElementBias) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
|
||||
if constexpr (IsPerChannelScaleEnabled) {
|
||||
tensor_alpha.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
|
||||
tensor_beta.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
|
||||
}
|
||||
initialize_values(tensor_A, init_A, seed);
|
||||
initialize_values(tensor_B, init_B, seed * 11);
|
||||
initialize_values(tensor_C, init_C, seed * 17);
|
||||
initialize_values(tensor_bias, init_bias, seed * 19);
|
||||
if constexpr (IsPerChannelScaleEnabled) {
|
||||
initialize_values(tensor_alpha, init_bias, seed * 23);
|
||||
if constexpr (DisableSource) {
|
||||
initialize_values(tensor_beta, init_disable, seed * 27);
|
||||
}
|
||||
else {
|
||||
initialize_values(tensor_beta, init_bias, seed * 27);
|
||||
}
|
||||
}
|
||||
|
||||
bool flag = true;
|
||||
if constexpr (isSparseEnabled) {
|
||||
flag &= params.initialize(problem_shape, tensor_B, static_cast<int>(seed + 2023));
|
||||
@ -314,8 +330,9 @@ struct ConvTestbed {
|
||||
bool run(
|
||||
ProblemShape const& problem_shape,
|
||||
ElementScalar alpha = ElementScalar(1),
|
||||
ElementScalar beta = ElementScalar(0)
|
||||
,
|
||||
ElementScalar beta = ElementScalar(0),
|
||||
dim3 cluster_shape = dim3(0, 0, 0),
|
||||
dim3 cluster_shape_fallback = dim3(0, 0, 0),
|
||||
RasterOrderOptions raster_order = RasterOrderOptions::Heuristic,
|
||||
MaxSwizzleSize max_swizzle = MaxSwizzleSize{},
|
||||
Splits splits = Splits{},
|
||||
@ -341,6 +358,9 @@ struct ConvTestbed {
|
||||
cudaGetDevice(&hw_info.device_id);
|
||||
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
|
||||
|
||||
hw_info.cluster_shape = cluster_shape;
|
||||
hw_info.cluster_shape_fallback = cluster_shape_fallback;
|
||||
|
||||
// configure the operator
|
||||
Conv conv_op;
|
||||
auto stride_C = StrideC{};
|
||||
@ -392,6 +412,11 @@ struct ConvTestbed {
|
||||
fusion_args.alpha = alpha;
|
||||
fusion_args.beta = beta;
|
||||
|
||||
if constexpr (IsPerChannelScaleEnabled) {
|
||||
fusion_args.alpha_ptr = tensor_alpha.data().get();
|
||||
fusion_args.beta_ptr = tensor_beta.data().get();
|
||||
}
|
||||
|
||||
if constexpr (IsBiasEnabled) {
|
||||
fusion_args.bias_ptr = tensor_bias.data().get();
|
||||
}
|
||||
@ -478,6 +503,11 @@ struct ConvTestbed {
|
||||
epilogue_fusion_params.alpha = alpha;
|
||||
epilogue_fusion_params.beta = beta;
|
||||
|
||||
if constexpr (IsPerChannelScaleEnabled) {
|
||||
epilogue_fusion_params.tensor_alpha = mAlpha;
|
||||
epilogue_fusion_params.tensor_beta = mBeta;
|
||||
}
|
||||
|
||||
if constexpr (IsBiasEnabled) {
|
||||
epilogue_fusion_params.tensor_bias = mBias;
|
||||
}
|
||||
@ -638,6 +668,16 @@ struct ConvTestbed {
|
||||
for (size_t i = 0; i < size_t(size(B)); ++i) {
|
||||
printf("[%llu]: B = %f\n", static_cast<unsigned long long>(i), float(B(i)));
|
||||
}
|
||||
if constexpr (IsPerChannelScaleEnabled) {
|
||||
for (size_t i = 0; i < size_t(size(tensor_alpha)); ++i) {
|
||||
printf("[%llu]: alpha = %f\n", static_cast<unsigned long long>(i),
|
||||
float(tensor_alpha(i)));
|
||||
}
|
||||
for (size_t i = 0; i < size_t(size(tensor_beta)); ++i) {
|
||||
printf("[%llu]: beta = %f\n", static_cast<unsigned long long>(i),
|
||||
float(tensor_beta(i)));
|
||||
}
|
||||
}
|
||||
if constexpr (IsBiasEnabled) {
|
||||
for (size_t i = 0; i < size_t(size(tensor_bias)); ++i) {
|
||||
printf("[%llu]: bias = %f\n", static_cast<unsigned long long>(i),
|
||||
@ -657,7 +697,9 @@ struct ConvTestbed {
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Conv, bool SupportStrides = (Conv::DispatchPolicy::ConvOp != cutlass::conv::Operator::kDgrad)>
|
||||
bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f
|
||||
bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f,
|
||||
dim3 cluster_shape = dim3(0, 0, 0),
|
||||
dim3 cluster_shape_fallback = dim3(0, 0, 0)
|
||||
) {
|
||||
using ElementScalar = typename Conv::EpilogueOutputOp::ElementScalar;
|
||||
|
||||
@ -697,8 +739,10 @@ bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f
|
||||
passed = testbed.run(
|
||||
conv_problem,
|
||||
cutlass::from_real<ElementScalar>(alpha),
|
||||
cutlass::from_real<ElementScalar>(beta)
|
||||
,RasterOrderOptions::Heuristic, // raster_order
|
||||
cutlass::from_real<ElementScalar>(beta),
|
||||
cluster_shape,
|
||||
cluster_shape_fallback,
|
||||
RasterOrderOptions::Heuristic, // raster_order
|
||||
MaxSwizzleSize(1),
|
||||
splits,
|
||||
decomp_mode
|
||||
|
||||
@ -30,6 +30,8 @@ add_custom_target(
|
||||
cutlass_test_unit_conv_wgrad_device
|
||||
DEPENDS
|
||||
cutlass_test_unit_conv_wgrad_device_tensorop_sm90
|
||||
cutlass_test_unit_conv_wgrad_device_tensorop_sm100
|
||||
cutlass_test_unit_conv_wgrad_device_tensorop_sm100_fusion
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
@ -44,3 +46,26 @@ cutlass_test_unit_add_executable(
|
||||
sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
|
||||
|
||||
cutlass_test_unit_add_executable(
|
||||
cutlass_test_unit_conv_wgrad_device_tensorop_sm100
|
||||
|
||||
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
sm100_conv3d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
|
||||
|
||||
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
#sm100_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
|
||||
)
|
||||
|
||||
cutlass_test_unit_add_executable_split_file(
|
||||
cutlass_test_unit_conv_wgrad_device_tensorop_sm100_fusion
|
||||
|
||||
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
sm100_conv3d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
|
||||
)
|
||||
|
||||
endif()
|
||||
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}

//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x128x64_1x2x1) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x64x64_2x1x1) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 256x64x64 MMA tile on a static 2x1x1 cluster (2-SM cooperative MMA).
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 2-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 256x64x64 MMA tile on a static 2x2x1 cluster (256x128x64 cluster tile).
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 2-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 64x64x64 MMA tile; the first two cluster modes are dynamic ints,
  // so the cluster shape is resolved at launch time.
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Let the builder pick the epilogue schedule for the dynamic cluster.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; launch with a preferred 2x4x1 cluster, falling back to 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,96 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||

// alpha != 1 && beta != 0
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // f16 activation/filter/output, f16 accumulation, f32 epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = cutlass::half_t;
  using ElementAcc     = cutlass::half_t;
  using ElementCompute = float;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=2, beta=1 exercises the scaled-accumulate path through the C tensor.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||

//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // TMA warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 1-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x64x64_2x1x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 256x64x64 MMA tile on a static 2x1x1 cluster (2-SM cooperative MMA).
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 2-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 256x64x64 MMA tile on a static 2x2x1 cluster (256x128x64 cluster tile).
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Non-smem warp-specialized 2-SM epilogue writing the KCS filter-gradient tensor.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the standard conv1d wgrad problem sizes against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////

//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 activation/filter with f32 output, accumulation, and epilogue compute.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 64x64x64 MMA tile; the first two cluster modes are dynamic ints,
  // so the cluster shape is resolved at launch time.
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit aligned accesses for the epilogue source (C) and output (D) tensors.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementOut>::value;

  // Let the builder pick the epilogue schedule for the dynamic cluster.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorKCS, AlignmentC,
      ElementOut, cutlass::layout::TensorKCS, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Wgrad mainloop over NWC activation/filter tensors; stage count is
  // auto-sized after carving out the epilogue's shared storage.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementAct, cutlass::layout::TensorNWC, 8,
      ElementFlt, cutlass::layout::TensorNWC, 8,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; launch with a preferred 2x4x1 cluster, falling back to 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,96 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = cutlass::half_t;
|
||||
using ElementAcc = cutlass::half_t;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_1,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_1,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::TmaWarpSpecialized1Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
ProblemShape,
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
// Functional test: SM100 conv2d wgrad via implicit GEMM — f16 activations and
// filters in NHWC, f32 filter-gradient output in KCSR, f32 accumulation,
// 256x64x64 MMA tile distributed over a static 2x1x1 cluster.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Operand / epilogue element types.
  using ElementA           = cutlass::half_t;  // activations (conv input)
  using ElementB           = cutlass::half_t;  // filters
  using ElementD           = float;            // output (filter gradient)
  using ElementAccumulator = float;
  using ElementScalar      = float;            // epilogue compute type

  // Tile and cluster configuration under test.
  using TileShapeMNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_1,_1>;

  // 128-bit (16B) vectorized access widths for the epilogue C/D tensors.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, cutlass::layout::TensorKCSR, AlignC,
      ElementD, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNHWC, 8,
      ElementB, cutlass::layout::TensorNHWC, 8,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      // Stage count carves epilogue shared storage out of the SMEM budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape is derived from the mainloop's dispatch policy (conv op + rank).
  using ProblemShapeType =
      cutlass::conv::ConvProblemShape<MainloopOp::DispatchPolicy::ConvOp,
                                      MainloopOp::DispatchPolicy::NumSpatialDimensions>;
  using KernelType = cutlass::conv::kernel::ConvUniversal<ProblemShapeType, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Verifies SM100 conv2d wgrad (implicit GEMM): f16 activations/filters in NHWC,
// f32 filter-gradient output in KCSR, f32 accumulation. The 256x64x64 MMA tile on a
// 2x2x1 cluster yields the 256x128x64 cluster tile named in the test title.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  using ElementAct = cutlass::half_t;   // activation (conv input) element type
  using ElementFlt = cutlass::half_t;   // filter element type
  using ElementOut = float;             // output (filter gradient) element type
  using ElementAcc = float;             // accumulator element type
  using ElementCompute = float;         // epilogue compute element type
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // Epilogue: no-SMEM warp-specialized schedule, 2-SM variant (matches the M=256 tile).
  // C/D tensors use the 2-D filter layout KCSR with 128-bit-aligned vector access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop: wgrad operator; stage count auto-carved around the epilogue's
  // shared-storage footprint so both collectives fit in SMEM.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape is deduced from the mainloop dispatch policy (conv op + spatial rank).
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Runs the standard conv problem sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Verifies SM100 conv2d wgrad with a *dynamic* cluster: the cluster shape is
// resolved at runtime from preferred (2x4x1) and fallback (2x2x1) shapes passed
// to the test harness, rather than baked into the kernel type.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::half_t;   // activation (conv input) element type
  using ElementFlt = cutlass::half_t;   // filter element type
  using ElementOut = float;             // output (filter gradient) element type
  using ElementAcc = float;             // accumulator element type
  using ElementCompute = float;         // epilogue compute element type
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  // Runtime-int M/N cluster extents mark the cluster shape as dynamic.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // EpilogueScheduleAuto lets the builder pick a schedule compatible with the
  // dynamic cluster configuration.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNHWC, 8,
    ElementFlt, cutlass::layout::TensorNHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster dim3(2,4,1), fallback dim3(2,2,1)
  // supplied at run time to exercise the dynamic-cluster path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,338 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Functional test: SM100 conv3d wgrad via implicit GEMM — f16 everywhere
// (NDHWC activations/filters, KCSRT filter-gradient output, f16 accumulation),
// 64x64x64 MMA tile on a static 1x1x1 cluster with the TMA 1-SM epilogue.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Operand / epilogue element types (all-f16 path, f32 epilogue compute).
  using ElementA           = cutlass::half_t;  // activations (conv input)
  using ElementB           = cutlass::half_t;  // filters
  using ElementD           = cutlass::half_t;  // output (filter gradient)
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  // Tile and cluster configuration under test.
  using TileShapeMNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // 128-bit (16B) vectorized access widths for the epilogue C/D tensors.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, cutlass::layout::TensorKCSRT, AlignC,
      ElementD, cutlass::layout::TensorKCSRT, AlignD,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNDHWC, 8,
      ElementB, cutlass::layout::TensorNDHWC, 8,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      // Stage count carves epilogue shared storage out of the SMEM budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape is derived from the mainloop's dispatch policy (conv op + rank).
  using ProblemShapeType =
      cutlass::conv::ConvProblemShape<MainloopOp::DispatchPolicy::ConvOp,
                                      MainloopOp::DispatchPolicy::NumSpatialDimensions>;
  using KernelType = cutlass::conv::kernel::ConvUniversal<ProblemShapeType, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Verifies SM100 conv3d wgrad (implicit GEMM): all-f16 data path (NDHWC in,
// KCSRT filter-gradient out, f16 accumulation), 128x64x64 MMA tile on a
// static 1x1x1 cluster with the no-SMEM 1-SM epilogue schedule.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;       // activation (conv input) element type
  using ElementFlt = cutlass::half_t;       // filter element type
  using ElementOut = cutlass::half_t;       // output (filter gradient) element type
  using ElementAcc = cutlass::half_t;       // accumulator element type
  using ElementCompute = float;             // epilogue compute element type
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // C/D tensors use the 3-D filter layout KCSRT with 128-bit-aligned vector access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop stage count is auto-carved around the epilogue's SMEM footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape is deduced from the mainloop dispatch policy (conv op + spatial rank).
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Runs the standard conv problem sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
|
||||
// using ElementAct = cutlass::half_t;
|
||||
// using ElementFlt = cutlass::half_t;
|
||||
// using ElementOut = cutlass::half_t;
|
||||
// using ElementAcc = cutlass::half_t;
|
||||
// using ElementCompute = float;
|
||||
// using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
// using ClusterShape = Shape<_1,_2,_1>;
|
||||
//
|
||||
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
// ElementAcc, ElementCompute,
|
||||
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
// cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// cutlass::conv::Operator::kWgrad,
|
||||
// ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementAcc,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
// cutlass::conv::collective::KernelScheduleAuto
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
// ProblemShape,
|
||||
// CollectiveMainloop,
|
||||
// CollectiveEpilogue
|
||||
// >;
|
||||
//
|
||||
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
//
|
||||
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
//}
|
||||
//
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
|
||||
// using ElementAct = cutlass::half_t;
|
||||
// using ElementFlt = cutlass::half_t;
|
||||
// using ElementOut = cutlass::half_t;
|
||||
// using ElementAcc = cutlass::half_t;
|
||||
// using ElementCompute = float;
|
||||
// using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
|
||||
// using ClusterShape = Shape<_2,_1,_1>;
|
||||
//
|
||||
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
// ElementAcc, ElementCompute,
|
||||
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
// cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// cutlass::conv::Operator::kWgrad,
|
||||
// ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementAcc,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
// cutlass::conv::collective::KernelScheduleAuto
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
|
||||
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
// ProblemShape,
|
||||
// CollectiveMainloop,
|
||||
// CollectiveEpilogue
|
||||
// >;
|
||||
//
|
||||
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
//
|
||||
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
//}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
// Verifies SM100 conv3d wgrad (implicit GEMM): all-f16 data path (NDHWC in,
// KCSRT filter-gradient out, f16 accumulation). The 256x64x64 MMA tile on a
// 2x2x1 cluster yields the 256x128x64 cluster tile named in the test title.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
  using ElementAct = cutlass::half_t;       // activation (conv input) element type
  using ElementFlt = cutlass::half_t;       // filter element type
  using ElementOut = cutlass::half_t;       // output (filter gradient) element type
  using ElementAcc = cutlass::half_t;       // accumulator element type
  using ElementCompute = float;             // epilogue compute element type
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // 2-SM no-SMEM epilogue schedule matches the M=256 tile; C/D use the 3-D
  // filter layout KCSRT with 128-bit-aligned vector access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized2Sm
  >::CollectiveOp;

  // Mainloop stage count is auto-carved around the epilogue's SMEM footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Problem shape is deduced from the mainloop dispatch policy (conv op + spatial rank).
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Runs the standard conv problem sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
// Verifies SM100 conv3d wgrad with a *dynamic* cluster: the cluster shape is
// resolved at runtime from preferred (2x4x1) and fallback (2x2x1) shapes
// passed to the test harness, rather than baked into the kernel type.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = cutlass::half_t;       // activation (conv input) element type
  using ElementFlt = cutlass::half_t;       // filter element type
  using ElementOut = cutlass::half_t;       // output (filter gradient) element type
  using ElementAcc = cutlass::half_t;       // accumulator element type
  using ElementCompute = float;             // epilogue compute element type
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  // Runtime-int M/N cluster extents mark the cluster shape as dynamic.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // EpilogueScheduleAuto lets the builder pick a schedule compatible with the
  // dynamic cluster configuration.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster dim3(2,4,1), fallback dim3(2,2,1)
  // supplied at run time to exercise the dynamic-cluster path.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,96 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
// alpha != 1 && beta != 0
|
||||
// Functional test: SM100 conv3d wgrad with a non-trivial linear combination —
// alpha = 2, beta = 1 — so the epilogue's source-accumulate path is exercised.
// All-f16 data path (NDHWC in, KCSRT filter-gradient out, f16 accumulation),
// 64x64x64 MMA tile on a static 1x1x1 cluster.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Operand / epilogue element types.
  using ElementA           = cutlass::half_t;  // activations (conv input)
  using ElementB           = cutlass::half_t;  // filters
  using ElementD           = cutlass::half_t;  // output (filter gradient)
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  // Tile and cluster configuration under test.
  using TileShapeMNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // 128-bit (16B) vectorized access widths for the epilogue C/D tensors.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;

  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, cutlass::layout::TensorKCSRT, AlignC,
      ElementD, cutlass::layout::TensorKCSRT, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNDHWC, 8,
      ElementB, cutlass::layout::TensorNDHWC, 8,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      // Stage count carves epilogue shared storage out of the SMEM budget.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape is derived from the mainloop's dispatch policy (conv op + rank).
  using ProblemShapeType =
      cutlass::conv::ConvProblemShape<MainloopOp::DispatchPolicy::ConvOp,
                                      MainloopOp::DispatchPolicy::NumSpatialDimensions>;
  using KernelType = cutlass::conv::kernel::ConvUniversal<ProblemShapeType, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -0,0 +1,326 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
/*! \file
|
||||
\brief Tests for device-wide CONV interface
|
||||
*/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include "cutlass/cutlass.h"
|
||||
#include "cute/tensor.hpp"
|
||||
#include "cute/atom/mma_atom.hpp"
|
||||
|
||||
#include "cutlass/numeric_types.h"
|
||||
|
||||
#include "cutlass/conv/device/conv_universal_adapter.hpp"
|
||||
#include "cutlass/conv/kernel/conv_universal.hpp"
|
||||
#include "cutlass/conv/collective/collective_builder.hpp"
|
||||
#include "cutlass/epilogue/collective/collective_builder.hpp"
|
||||
|
||||
#include "../testbed_conv.hpp"
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Static cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Cluster tile shape 64x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Verifies SM100 conv3d wgrad (implicit GEMM): f16 activations/filters in
// NDHWC, f32 filter-gradient output in KCSRT, f32 accumulation, 64x64x64 MMA
// tile on a static 1x1x1 cluster with the TMA 1-SM epilogue schedule.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;   // activation (conv input) element type
  using ElementFlt = cutlass::half_t;   // filter element type
  using ElementOut = float;             // output (filter gradient) element type
  using ElementAcc = float;             // accumulator element type
  using ElementCompute = float;         // epilogue compute element type
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // C/D tensors use the 3-D filter layout KCSRT with 128-bit-aligned vector access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::TmaWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop stage count is auto-carved around the epilogue's SMEM footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Fix: ConvUniversal takes the problem shape as its first template argument;
  // it was missing here (every sibling test in this file passes it).
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Runs the standard conv problem sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x64x64
|
||||
// Cluster shape 1x1x1
|
||||
//
|
||||
// Verifies SM100 conv3d wgrad (implicit GEMM): f16 activations/filters in
// NDHWC, f32 filter-gradient output in KCSRT, f32 accumulation, 128x64x64 MMA
// tile on a static 1x1x1 cluster with the no-SMEM 1-SM epilogue schedule.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  using ElementAct = cutlass::half_t;   // activation (conv input) element type
  using ElementFlt = cutlass::half_t;   // filter element type
  using ElementOut = float;             // output (filter gradient) element type
  using ElementAcc = float;             // accumulator element type
  using ElementCompute = float;         // epilogue compute element type
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // C/D tensors use the 3-D filter layout KCSRT with 128-bit-aligned vector access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized1Sm
  >::CollectiveOp;

  // Mainloop stage count is auto-carved around the epilogue's SMEM footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Fix: ConvUniversal takes the problem shape as its first template argument;
  // it was missing here (every sibling test in this file passes it).
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;

  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Runs the standard conv problem sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
|
||||
|
||||
//
|
||||
// Cluster tile shape 128x128x64
|
||||
// Cluster shape 1x2x1
|
||||
//
|
||||
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
|
||||
// using ElementAct = cutlass::half_t;
|
||||
// using ElementFlt = cutlass::half_t;
|
||||
// using ElementOut = float;
|
||||
// using ElementAcc = float;
|
||||
// using ElementCompute = float;
|
||||
// using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
|
||||
// using ClusterShape = Shape<_1,_2,_1>;
|
||||
//
|
||||
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
// ElementAcc, ElementCompute,
|
||||
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
// cutlass::epilogue::NoSmemWarpSpecialized1Sm
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// cutlass::conv::Operator::kWgrad,
|
||||
// ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementAcc,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
// cutlass::conv::collective::KernelScheduleAuto
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
// CollectiveMainloop,
|
||||
// CollectiveEpilogue
|
||||
// >;
|
||||
//
|
||||
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
//
|
||||
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
//}
|
||||
//
|
||||
//
|
||||
// Cluster tile shape 256x64x64
|
||||
// Cluster shape 2x1x1
|
||||
//
|
||||
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
|
||||
// using ElementAct = cutlass::half_t;
|
||||
// using ElementFlt = cutlass::half_t;
|
||||
// using ElementOut = float;
|
||||
// using ElementAcc = float;
|
||||
// using ElementCompute = float;
|
||||
// using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
|
||||
// using ClusterShape = Shape<_2,_1,_1>;
|
||||
//
|
||||
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
// ElementAcc, ElementCompute,
|
||||
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
// cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
// cutlass::conv::Operator::kWgrad,
|
||||
// ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
// ElementAcc,
|
||||
// MmaTileShape, ClusterShape,
|
||||
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
// cutlass::conv::collective::KernelScheduleAuto
|
||||
// >::CollectiveOp;
|
||||
//
|
||||
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
// CollectiveMainloop,
|
||||
// CollectiveEpilogue
|
||||
// >;
|
||||
//
|
||||
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
//
|
||||
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
//}
|
||||
|
||||
//
|
||||
// Cluster tile shape 256x128x64
|
||||
// Cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = Shape<_2,_2,_1>;
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::NoSmemWarpSpecialized2Sm
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dynamic cluster
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// CTA tile shape 64x64x64
|
||||
// preferred cluster shape 2x4x1
|
||||
// fallback cluster shape 2x2x1
|
||||
//
|
||||
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
|
||||
using ElementAct = cutlass::half_t;
|
||||
using ElementFlt = cutlass::half_t;
|
||||
using ElementOut = float;
|
||||
using ElementAcc = float;
|
||||
using ElementCompute = float;
|
||||
using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
|
||||
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
|
||||
|
||||
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::epilogue::collective::EpilogueTileAuto,
|
||||
ElementAcc, ElementCompute,
|
||||
ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
|
||||
ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
|
||||
cutlass::epilogue::collective::EpilogueScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
|
||||
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
|
||||
cutlass::conv::Operator::kWgrad,
|
||||
ElementAct, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementFlt, cutlass::layout::TensorNDHWC, 8,
|
||||
ElementAcc,
|
||||
MmaTileShape, ClusterShape,
|
||||
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
|
||||
cutlass::conv::collective::KernelScheduleAuto
|
||||
>::CollectiveOp;
|
||||
|
||||
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
|
||||
CollectiveMainloop,
|
||||
CollectiveEpilogue
|
||||
>;
|
||||
|
||||
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
|
||||
|
||||
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
|
||||
}
|
||||
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
@ -50,6 +50,6 @@ cutlass_test_unit_add_executable(
|
||||
pointer.cpp
|
||||
reverse.cpp
|
||||
swizzle_layout.cpp
|
||||
transform.cpp
|
||||
tensor_algs.cpp
|
||||
tuple.cpp
|
||||
)
|
||||
|
||||
200
test/unit/cute/core/tensor_algs.cpp
Normal file
200
test/unit/cute/core/tensor_algs.cpp
Normal file
@ -0,0 +1,200 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include <cute/algorithm/tensor_algorithms.hpp>
|
||||
#include <cute/algorithm/tensor_reduce.hpp>
|
||||
#include <cute/numeric/complex.hpp>
|
||||
|
||||
TEST(CuTe_algorithm, TensorTransform) {
|
||||
using namespace cute;
|
||||
complex<float> array[4] = {{0,0}, {1,0}, {0,1}, {1,1}};
|
||||
complex<float> correct[4] = {{0,0}, {1,0}, {0,-1}, {1,-1}};
|
||||
Tensor tensor = make_tensor(static_cast<complex<float>*>(array), make_layout(make_shape(4)));
|
||||
conjugate conj;
|
||||
transform(tensor, conj);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
EXPECT_EQ(tensor(i), correct[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(CuTe_algorithm, TensorBatchReduce) {
|
||||
using namespace cute;
|
||||
|
||||
int src_vals[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
|
||||
Tensor src_tensor = make_tensor(static_cast<int*>(src_vals),
|
||||
make_layout(make_shape (make_shape (2,2), make_shape (2,2)),
|
||||
make_stride(make_stride(2,8), make_stride(1,4))));
|
||||
|
||||
array<int, 4> dst_vals;
|
||||
fill(dst_vals, 0);
|
||||
Tensor dst_tensor = make_tensor(dst_vals.begin(), make_shape(2,2));
|
||||
|
||||
batch_reduce(src_tensor, dst_tensor);
|
||||
|
||||
int correct[4] = {20,24,36,40};
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
//printf("%d %d\n", dst_tensor(i), correct[i]);
|
||||
EXPECT_EQ(dst_tensor(i), correct[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
TEST(CuTe_algorithm, TensorLogicalReduce) {
|
||||
using namespace cute;
|
||||
|
||||
{ // Reduce each column of a matrix
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape <_32, Shape <_12,_6>>,
|
||||
Stride< _1, Stride<_64,_1>>>{});
|
||||
auto slicer = make_coord(0_c, _);
|
||||
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, slicer);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), reduce(src_tensor(_,i), int(0)));
|
||||
}
|
||||
}
|
||||
|
||||
{ // Reduce each row of a matrix
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape <_32, Shape <_12,_6>>,
|
||||
Stride< _1, Stride<_64,_1>>>{});
|
||||
auto slicer = make_coord(_, 0_c);
|
||||
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, slicer);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), reduce(src_tensor(i,_), int(0)));
|
||||
}
|
||||
}
|
||||
|
||||
{ // 1 profile
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape<_32>, Stride<_1>>{});
|
||||
array<int, 1> dst_vals;
|
||||
fill(dst_vals, 0);
|
||||
Tensor dst_tensor = make_tensor(dst_vals.begin(), Layout<_1,_0>{});
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, 1);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), reduce(src_tensor, int(0)));
|
||||
}
|
||||
}
|
||||
|
||||
{ // _ profile
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape<_32>, Stride<_1>>{});
|
||||
auto slicer = _;
|
||||
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, slicer);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), src_tensor(i));
|
||||
}
|
||||
}
|
||||
|
||||
{ // (1,1) profile
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape <_32, Shape <_12,_6>>,
|
||||
Stride< _1, Stride<_192,_32>>>{});
|
||||
auto slicer = make_coord(1, 1);
|
||||
array<int, 1> dst_vals;
|
||||
fill(dst_vals, 0);
|
||||
Tensor dst_tensor = make_tensor(dst_vals.begin(), Layout<_1,_0>{});
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, slicer);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), reduce(src_tensor, int(0)));
|
||||
}
|
||||
}
|
||||
|
||||
{ // (_,_) profile
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
Layout<Shape <_32, Shape <_12,_6>>,
|
||||
Stride< _1, Stride<_192,_32>>>{});
|
||||
auto slicer = make_coord(_,_);
|
||||
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, slicer);
|
||||
|
||||
for (int i = 0; i < size(dst_tensor); ++i) {
|
||||
EXPECT_EQ(dst_tensor(i), src_tensor(i));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
make_layout(make_shape (2,2,2,2),
|
||||
make_stride(1,2,4,8)));
|
||||
|
||||
array<int, 4> dst_vals;
|
||||
fill(dst_vals, 0);
|
||||
Tensor dst_tensor = make_tensor(dst_vals.begin(), make_shape(2,2));
|
||||
|
||||
auto target_profile = make_coord(_,1,_,1);
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, target_profile);
|
||||
|
||||
int correct[4] = {20,24,36,40};
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
//printf("%d %d\n", dst_tensor(i), correct[i]);
|
||||
EXPECT_EQ(dst_tensor(i), correct[i]);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
|
||||
make_layout(make_shape (2,make_shape (2,2),2),
|
||||
make_stride(1,make_stride(2,4),8)));
|
||||
|
||||
array<int, 4> dst_vals;
|
||||
fill(dst_vals, 0);
|
||||
Tensor dst_tensor = make_tensor(dst_vals.begin(), make_shape(2,2));
|
||||
|
||||
auto target_profile = make_coord(_,make_coord(1,_),1);
|
||||
|
||||
logical_reduce(src_tensor, dst_tensor, target_profile);
|
||||
|
||||
int correct[4] = {20,24,36,40};
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
//printf("%d %d\n", dst_tensor(i), correct[i]);
|
||||
EXPECT_EQ(dst_tensor(i), correct[i]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,49 +0,0 @@
|
||||
/***************************************************************************************************
|
||||
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: BSD-3-Clause
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its
|
||||
* contributors may be used to endorse or promote products derived from
|
||||
* this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
**************************************************************************************************/
|
||||
|
||||
#include "cutlass_unit_test.h"
|
||||
|
||||
#include <cutlass/trace.h>
|
||||
#include <cute/tensor.hpp>
|
||||
#include <cute/numeric/complex.hpp>
|
||||
|
||||
TEST(CuTe_core, Transform) {
|
||||
using namespace cute;
|
||||
complex<float> array[4] = {{0,0}, {1,0}, {0,1}, {1,1}};
|
||||
complex<float> correct[4] = {{0,0}, {1,0}, {0,-1}, {1,-1}};
|
||||
auto tensor = make_tensor(static_cast<complex<float>*>(array), make_layout(make_shape(4)));
|
||||
conjugate conj;
|
||||
transform(tensor, conj);
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
EXPECT_EQ(tensor(i), correct[i]);
|
||||
}
|
||||
}
|
||||
@ -54,7 +54,7 @@
|
||||
|
||||
using namespace cute;
|
||||
|
||||
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////////////// 128x64x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
|
||||
@ -263,5 +263,5 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
|
||||
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)
|
||||
|
||||
|
||||
@ -51,6 +51,9 @@ TEST(SM90_nvrtc_kernel, Contraction) {
|
||||
"-std=c++17",
|
||||
"-arch=sm_90",
|
||||
"-I" CUDA_INCLUDE_DIR,
|
||||
#if (__CUDACC_VER_MAJOR__ >= 13)
|
||||
"-I" CUDA_INCLUDE_DIR "/cccl",
|
||||
#endif // __CUDACC_VER_MAJOR__ >= 13
|
||||
};
|
||||
|
||||
EXPECT_TRUE(test::nvrtc::thread::TestbedKernel::compile(
|
||||
@ -60,7 +63,7 @@ TEST(SM90_nvrtc_kernel, Contraction) {
|
||||
"cute::Shape<cute::_1, cute::_2, cute::_1>,"
|
||||
"true, true,"
|
||||
"10, 10, 10, 10>::Kernel",
|
||||
{ nvrtc_opts, nvrtc_opts + 5 }
|
||||
{ std::begin(nvrtc_opts), std::end(nvrtc_opts) }
|
||||
));
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user