v4.0 update. (#2371)

This commit is contained in:
Junkai-Wu
2025-06-06 14:39:20 +08:00
committed by GitHub
parent 2e2af190bd
commit 8bdbfca682
254 changed files with 29751 additions and 1980 deletions

View File

@ -52,7 +52,6 @@ set(header_files_to_check
cute/swizzle_layout.hpp
cute/tensor.hpp
cute/tensor_impl.hpp
cute/tensor_predicate.hpp
cute/underscore.hpp
# cute/algorithm
cute/algorithm/axpby.hpp

View File

@ -30,6 +30,8 @@ add_custom_target(
cutlass_test_unit_conv_dgrad_device
DEPENDS
cutlass_test_unit_conv_dgrad_device_tensorop_sm90
cutlass_test_unit_conv_dgrad_device_tensorop_sm100
cutlass_test_unit_conv_dgrad_device_tensorop_sm100_fusion
)
cutlass_test_unit_add_executable(
@ -47,3 +49,43 @@ cutlass_test_unit_add_executable(
sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
)
# Only build SM100 (arch 100a) dgrad tests when that architecture is enabled.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
# SM100 dgrad kernel sources: f8-input variants (four output types) and
# f16-input variants (f16 and f32 accumulation), each across conv1d/2d/3d.
set(cutlass_test_unit_conv_dgrad_device_tensorop_sm100_kernels
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f8_tensorop_f32.cu
sm100_conv2d_dgrad_implicit_gemm_f8_f8_bf16_tensorop_f32.cu
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32.cu
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f32_tensorop_f32.cu
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f8_tensorop_f32.cu
sm100_conv3d_dgrad_implicit_gemm_f8_f8_bf16_tensorop_f32.cu
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32.cu
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f32_tensorop_f32.cu
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
)
# Add the executable
cutlass_test_unit_add_executable(
cutlass_test_unit_conv_dgrad_device_tensorop_sm100
${cutlass_test_unit_conv_dgrad_device_tensorop_sm100_kernels}
)
# Epilogue-fusion variants (bias/activation) are built as a separate binary so
# they can be depended on independently (see the add_custom_target DEPENDS list).
cutlass_test_unit_add_executable(
cutlass_test_unit_conv_dgrad_device_tensorop_sm100_fusion
sm100_conv2d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32_with_fusion.cu
sm100_conv3d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32_with_fusion.cu
sm100_conv1d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv2d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv3d_dgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
)
endif()

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 64x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 128x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x128x64_1x2x1) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 128x64 MMA tile on a static 1x2x1 cluster (cluster tile 128x128 per test name).
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_2,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x64x64_2x1x1) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 256x64 MMA tile on a static 2x1x1 cluster.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_1,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 256x64 MMA tile on a static 2x2x1 cluster (cluster tile 256x128 per test name).
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_2,_1>;

  // Build the epilogue first: its shared-storage size is carved out of the
  // mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic-cluster variant: the cluster M/N modes are runtime ints, and the
  // preferred/fallback cluster dims are passed to the testbed at the bottom.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = cutlass::half_t;
  using ElementCompute = float;
  // 64x64 MMA tile (K mode of 64).
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster shape: runtime M/N, static K mode of 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Epilogue is built first so its shared-storage size can be carved out of
  // the mainloop stage budget.
  // NOTE: uses sizeof_bits_v<> for the alignment computation, matching every
  // other test in this file (previously cutlass::sizeof_bits<>::value).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementAct>,
    ElementOut, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ElementOut>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=1, beta=0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,190 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// alpha != 1 && beta != 0
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;

  // 64x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // Exercise a non-trivial linear combination: alpha=2, beta=1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute;
  // f16 per-column bias fused into the epilogue.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;
  using BiasT    = cutlass::half_t;

  // 64x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Fusion op: linear combination plus per-column bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    OutT, ComputeT, BiasT>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // Exercise a non-trivial linear combination: alpha=2, beta=1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // f16 activation/filter/output with f16 accumulation; f32 epilogue compute;
  // per-column bias + ReLU fused into the epilogue.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = cutlass::half_t;
  using AccT     = cutlass::half_t;
  using ComputeT = float;
  using BiasT    = cutlass::half_t;

  // 64x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Fusion op: linear combination, per-column bias, then elementwise ReLU.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, OutT, ComputeT, BiasT>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<ActT>,
    OutT, cutlass::layout::TensorNWC, 128 / sizeof_bits_v<OutT>,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // Exercise a non-trivial linear combination: alpha=2, beta=1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 activation/filter in, f32 output with f32 accumulation and compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = float;
  using AccT     = float;
  using ComputeT = float;

  // 64x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ActT>::value,
    OutT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<OutT>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 activation/filter in, f32 output with f32 accumulation and compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = float;
  using AccT     = float;
  using ComputeT = float;

  // 128x64 MMA tile (K mode of 64) on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ActT>::value,
    OutT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<OutT>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f16 activation/filter in, f32 output with f32 accumulation and compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = float;
  using AccT     = float;
  using ComputeT = float;

  // 128x64 MMA tile on a static 1x2x1 cluster (cluster tile 128x128 per test name).
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_2,_1>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ActT>::value,
    OutT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<OutT>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x64x64_2x1x1) {
  // f16 activation/filter in, f32 output with f32 accumulation and compute.
  using ActT     = cutlass::half_t;
  using FltT     = cutlass::half_t;
  using OutT     = float;
  using AccT     = float;
  using ComputeT = float;

  // 256x64 MMA tile (K mode of 64) on a static 2x1x1 cluster.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_1,_1>;

  // Epilogue first: its shared-storage size feeds the mainloop stage carveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::epilogue::collective::EpilogueTileAuto,
    AccT, ComputeT,
    ActT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ActT>::value,
    OutT, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<OutT>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ActT, cutlass::layout::TensorNWC, 8,
    FltT, cutlass::layout::TensorNWC, 8,
    AccT,
    TileShapeMNK, ClusterShapeMNK,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
    Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Operand types: fp16 inputs (output gradient / filter), fp32 output, fp32 accumulation.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNWC;

  // 256x64x64 2-SM MMA tile with a 2x2x1 cluster -> 256x128x64 cluster tile.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_dgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic-cluster dgrad test: fp16 inputs, fp32 output/accumulation, NWC layout.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Runtime (dynamic) cluster: M/N extents carried as ints, K fixed to 1;
  // the actual preferred/fallback shapes are passed to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    // Fixed: was kFprop, but this is a dgrad test (see test name and the
    // matching conv2d dynamic-cluster dgrad test, which uses kDgrad).
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNWC, 8,
    ElementFlt, cutlass::layout::TensorNWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=1, beta=0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 64x64x64 MMA tile with a single-CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 128x64x64 MMA tile with a single-CTA cluster.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 128x64x64 MMA tile with a 1x2x1 cluster -> 128x128x64 cluster tile.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 256x64x64 2-SM MMA tile with a 2x1x1 cluster -> 256x64x64 cluster tile.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 256x64x64 2-SM MMA tile with a 2x2x1 cluster -> 256x128x64 cluster tile.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // All-fp16 operands with fp16 accumulation; fp32 epilogue compute.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Runtime (dynamic) cluster: M/N extents carried as ints, K fixed to 1;
  // the actual preferred/fallback shapes are passed to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,190 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Exercises linear-combination epilogue with non-trivial alpha/beta.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // Exercises linear-combination epilogue with per-column bias fusion.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using ElementBias        = cutlass::half_t;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // D = activation(alpha * acc + beta * C + per-column bias), no activation here.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementD, ElementCompute, ElementBias>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Exercises linear-combination epilogue with per-column bias + ReLU fusion.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using ElementBias        = cutlass::half_t;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // D = ReLU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementD, ElementCompute, ElementBias>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Operand types: fp16 inputs (output gradient / filter), fp32 output, fp32 accumulation.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 64x64x64 MMA tile with a single-CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Operand types: fp16 inputs (output gradient / filter), fp32 output, fp32 accumulation.
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutTag          = cutlass::layout::TensorNHWC;

  // 128x64x64 MMA tile with a single-CTA cluster.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit epilogue accesses; 8-element mainloop alignment.
  static constexpr int AlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignAB = 8;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementA, LayoutTag, AlignC,
    ElementD, LayoutTag, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementA, LayoutTag, AlignAB,
    ElementB, LayoutTag, AlignAB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;

  // Assemble the device-level convolution from the two collectives.
  using ProblemShape = cutlass::conv::ConvProblemShape<
    CollectiveMainloop::DispatchPolicy::ConvOp,
    CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Functional test: f16 dgrad, cluster tile 128x128x64 from a 128x64x64 MMA tile
// replicated by the 1x2x1 cluster (per the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::half_t;   // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::half_t;   // B operand (filter)
using ElementOut = float;             // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_2,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B operands: NHWC, alignment 8 elements (8 x f16 == 128 bits).
ElementAct, cutlass::layout::TensorNHWC, 8,
ElementFlt, cutlass::layout::TensorNHWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Functional test: f16 dgrad, cluster tile 256x64x64 on a 2x1x1 cluster.
// NOTE(review): MMA tile M=256 with cluster M=2 — presumably a 2-SM MMA where
// the M tile spans the cluster; confirm against the SM100 builder docs.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::half_t;   // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::half_t;   // B operand (filter)
using ElementOut = float;             // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_1,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B operands: NHWC, alignment 8 elements (8 x f16 == 128 bits).
ElementAct, cutlass::layout::TensorNHWC, 8,
ElementFlt, cutlass::layout::TensorNHWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Functional test: f16 dgrad, cluster tile 256x128x64 on a 2x2x1 cluster
// (MMA tile 256x64x64; cluster N dim doubles the N tile per the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::half_t;   // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::half_t;   // B operand (filter)
using ElementOut = float;             // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B operands: NHWC, alignment 8 elements (8 x f16 == 128 bits).
ElementAct, cutlass::layout::TensorNHWC, 8,
ElementFlt, cutlass::layout::TensorNHWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Functional test: f16 dgrad with a *dynamic* cluster — M/N cluster dims are
// runtime ints; preferred cluster 2x4x1 with fallback 2x2x1 (per the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
using ElementAct = cutlass::half_t;   // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::half_t;   // B operand (filter)
using ElementOut = float;             // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
// Dynamic cluster: runtime ints in M and N, static 1 in L.
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 8,
ElementFlt, cutlass::layout::TensorNHWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Extra args: presumably alpha=1, beta=0, a tolerance, then the preferred and
// fallback cluster dims — verify against the testbed's TestAllConv signature.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Functional test: SM100 2-D dgrad implicit GEMM, e4m3 operands, bf16 output,
  // one 64x64x64 MMA tile on a static 1x1x1 cluster; schedules auto-selected.
  using ElementA    = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
  using ElementB    = cutlass::float_e4m3_t;  // B operand (filter)
  using ElementD    = cutlass::bfloat16_t;    // result tensor
  using ElementAcc  = float;                  // accumulator
  using ElementEpi  = float;                  // epilogue compute type
  using LayoutNHWC  = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit (16-byte) vector access widths, expressed in elements.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);

  // Build the epilogue first: its shared-memory footprint is subtracted from
  // the mainloop stage budget via StageCountAutoCarveout below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementEpi,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and verify against the reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Functional test: e4m3 dgrad -> bf16 output, 128x64x64 MMA tile, 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 128x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::bfloat16_t;    // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Functional test: e4m3 dgrad -> bf16 output, cluster tile 128x128x64 from a
// 128x64x64 MMA tile replicated by the 1x2x1 cluster (per the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::bfloat16_t;    // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_2,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Functional test: e4m3 dgrad -> bf16 output, cluster tile 256x64x64, 2x1x1 cluster.
// NOTE(review): MMA tile M=256 with cluster M=2 — presumably a 2-SM MMA spanning
// the cluster in M; confirm against the SM100 builder docs.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::bfloat16_t;    // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_1,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Functional test: e4m3 dgrad -> bf16 output, cluster tile 256x128x64 on a
// 2x2x1 cluster (MMA tile 256x64x64; cluster N doubles the N tile per the name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::bfloat16_t;    // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Functional test: e4m3 dgrad -> bf16 output with a *dynamic* cluster — M/N
// cluster dims are runtime ints; preferred cluster 2x4x1, fallback 2x2x1.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_bf16nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::bfloat16_t;    // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
// Dynamic cluster: runtime ints in M and N, static 1 in L.
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Extra args: presumably alpha=1, beta=0, a tolerance, then the preferred and
// fallback cluster dims — verify against the testbed's TestAllConv signature.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Functional test: SM100 2-D dgrad implicit GEMM, e4m3 operands, f16 output,
  // one 64x64x64 MMA tile on a static 1x1x1 cluster; schedules auto-selected.
  using ElementA    = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
  using ElementB    = cutlass::float_e4m3_t;  // B operand (filter)
  using ElementD    = cutlass::half_t;        // result tensor
  using ElementAcc  = float;                  // accumulator
  using ElementEpi  = float;                  // epilogue compute type
  using LayoutNHWC  = cutlass::layout::TensorNHWC;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;

  // 128-bit (16-byte) vector access widths, expressed in elements.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int AlignA = 16 / sizeof(ElementA);
  static constexpr int AlignB = 16 / sizeof(ElementB);

  // Build the epilogue first: its shared-memory footprint is subtracted from
  // the mainloop stage budget via StageCountAutoCarveout below.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementEpi,
      ElementA, LayoutNHWC, AlignC,
      ElementD, LayoutNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, LayoutNHWC, AlignA,
      ElementB, LayoutNHWC, AlignB,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
      ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the testbed's problem sizes and verify against the reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Functional test: e4m3 dgrad -> f16 output, 128x64x64 MMA tile, 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 128x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::half_t;        // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Functional test: e4m3 dgrad -> f16 output, cluster tile 128x128x64 from a
// 128x64x64 MMA tile replicated by the 1x2x1 cluster (per the test name).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::half_t;        // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_2,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Functional test: e4m3 dgrad -> f16 output, cluster tile 256x64x64, 2x1x1 cluster.
// NOTE(review): MMA tile M=256 with cluster M=2 — presumably a 2-SM MMA spanning
// the cluster in M; confirm against the SM100 builder docs.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::float_e4m3_t;  // A operand ("Act" side of dgrad)
using ElementFlt = cutlass::float_e4m3_t;  // B operand (filter)
using ElementOut = cutlass::half_t;        // result tensor element
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_1,_1>;
// Epilogue built first: its shared-memory size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
// A/B alignment in elements for a 16-byte (128-bit) access.
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Sweep the testbed's problem sizes and verify against the reference.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Functional test: e4m3 in, f16 out, f32 accumulate; MMA tile 256x64x64 with a
// static 2x2x1 cluster.
// NOTE(review): the test name says 256x128x64 while MmaTileShape is 256x64 —
// presumably the name refers to the cluster-level tile (N doubled by cluster
// N=2); confirm against the sibling 256x64x64_2x1x1 case.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Dynamic-cluster test: runtime cluster dims in M/N (int(0) placeholders);
// the testbed is given preferred 2x4x1 and fallback 2x2x1 clusters as dim3s.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
// Dynamic M/N cluster extents (runtime ints); Z is statically 1.
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Args: alpha=1.0, beta=0.0, then a third scalar (0.0f), preferred and
// fallback cluster shapes — presumably matching the testbed's signature for
// dynamic-cluster runs; verify against testbed_conv.hpp.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,237 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Epilogue-fusion test: default linear-combination epilogue exercised with
// non-trivial scaling — TestAllConv is called with alpha=2.0, beta=1.0.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// alpha=2.0, beta=1.0 (the alpha != 1 && beta != 0 case).
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
// Epilogue-fusion test: linear combination plus a per-column f16 bias
// (LinCombPerColBias), run with alpha=2.0, beta=1.0.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Fusion op replaces the default epilogue: out = alpha*acc + beta*C + bias.
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
ElementOut, ElementCompute, ElementBias>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// alpha=2.0, beta=1.0; bias handling is exercised via the fusion op.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
// Epilogue-fusion test: linear combination + per-column bias + ReLU
// activation (LinCombPerColBiasEltAct), run with alpha=2.0, beta=1.0.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Fusion op: elementwise ReLU applied after the biased linear combination.
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// alpha=2.0, beta=1.0.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// per-channel alpha/beta scaling && bias && relu
// Epilogue-fusion test: per-column alpha/beta vectors + per-column bias + ReLU
// (PerColLinCombPerColBiasEltAct).
// NOTE(review): unlike the sibling fusion tests this calls TestAllConv with
// default arguments rather than (2.0, 1.0) — presumably the per-column
// scale vectors are generated inside the testbed; confirm that is intended.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f16nhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::half_t;
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Fusion op: per-column scaling variant of the bias+activation epilogue.
using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// Functional test: e4m3 activations/filters, f32 output, f32 accumulation;
// MMA tile 64x64x64, static 1x1x1 cluster (smallest static configuration).
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue first: the mainloop's StageCountAutoCarveout (below) subtracts
// this epilogue's SharedStorage from available shared memory.
// Alignment 128 / sizeof_bits<T> == elements per 128-bit (16B) access.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; operand alignment is 16 / sizeof(T) elements (16 bytes).
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Default TestAllConv sweeps the testbed's conv problem sizes.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Functional test: e4m3 in, f32 out, f32 accumulate; MMA tile 128x64x64,
// static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Functional test: e4m3 in, f32 out, f32 accumulate; MMA tile 128x64x64 with
// a static 1x2x1 cluster.
// NOTE(review): the test name says 128x128x64 while MmaTileShape is 128x64 —
// presumably the name refers to the cluster-level tile (N doubled by cluster
// N=2); confirm.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_2,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Functional test: e4m3 in, f32 out, f32 accumulate; MMA tile 256x64x64,
// static 2x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_1,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Functional test: e4m3 in, f32 out, f32 accumulate; MMA tile 256x64x64 with
// a static 2x2x1 cluster.
// NOTE(review): the test name says 256x128x64 while MmaTileShape is 256x64 —
// presumably the name refers to the cluster-level tile (N doubled by cluster
// N=2); confirm against the sibling 256x64x64_2x1x1 case.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Dynamic-cluster test (f32 output): runtime cluster dims in M/N; the testbed
// is given preferred 2x4x1 and fallback 2x2x1 clusters as dim3s.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
// Dynamic M/N cluster extents (runtime ints); Z is statically 1.
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
// Epilogue first: the mainloop stage count carves out its SharedStorage.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; 16-byte operand alignment.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Args: alpha=1.0, beta=0.0, a third scalar (0.0f), then preferred and
// fallback cluster shapes — presumably matching the testbed's dynamic-cluster
// overload; verify against testbed_conv.hpp.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// Functional test: e4m3 activations/filters AND e4m3 output, f32 accumulation;
// MMA tile 64x64x64, static 1x1x1 cluster.
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 64x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue first: the mainloop's StageCountAutoCarveout (below) subtracts
// this epilogue's SharedStorage from available shared memory.
// Alignment 128 / sizeof_bits<T> == elements per 128-bit (16B) access.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Dgrad mainloop; operand alignment is 16 / sizeof(T) elements (16 bytes).
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
// Problem shape deduced from the mainloop's dispatch policy.
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Default TestAllConv sweeps the testbed's conv problem sizes.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are e4m3 in NHWC with f32 accumulation/compute.
  using TypeA   = cutlass::float_e4m3_t;
  using TypeB   = cutlass::float_e4m3_t;
  using TypeD   = cutlass::float_e4m3_t;
  using TypeAcc = float;
  using TypeEpiCompute = float;

  // 128-bit (16-byte) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeD, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeB, cutlass::layout::TensorNHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are e4m3 in NHWC with f32 accumulation/compute.
  using TypeA   = cutlass::float_e4m3_t;
  using TypeB   = cutlass::float_e4m3_t;
  using TypeD   = cutlass::float_e4m3_t;
  using TypeAcc = float;
  using TypeEpiCompute = float;

  // 128-bit (16-byte) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 128x64x64 MMA tile on a static 1x2x1 cluster -> 128x128x64 cluster tile.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_2,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeD, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeB, cutlass::layout::TensorNHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are e4m3 in NHWC with f32 accumulation/compute.
  using TypeA   = cutlass::float_e4m3_t;
  using TypeB   = cutlass::float_e4m3_t;
  using TypeD   = cutlass::float_e4m3_t;
  using TypeAcc = float;
  using TypeEpiCompute = float;

  // 128-bit (16-byte) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 256x64x64 MMA tile spanning a static 2x1x1 cluster.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeD, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeB, cutlass::layout::TensorNHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are e4m3 in NHWC with f32 accumulation/compute.
  using TypeA   = cutlass::float_e4m3_t;
  using TypeB   = cutlass::float_e4m3_t;
  using TypeD   = cutlass::float_e4m3_t;
  using TypeAcc = float;
  using TypeEpiCompute = float;

  // 128-bit (16-byte) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 256x64x64 MMA tile on a static 2x2x1 cluster -> 256x128x64 cluster tile.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_2,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeD, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeB, cutlass::layout::TensorNHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_dgrad_implicitgemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are e4m3 in NHWC with f32 accumulation/compute.
  using TypeA   = cutlass::float_e4m3_t;
  using TypeB   = cutlass::float_e4m3_t;
  using TypeD   = cutlass::float_e4m3_t;
  using TypeAcc = float;
  using TypeEpiCompute = float;

  // 128-bit (16-byte) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 64x64x64 MMA tile; cluster M/N are runtime ints (dynamic cluster shape),
  // so the preferred/fallback clusters are passed to the testbed below.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeD, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNHWC, AlignA,
      TypeB, cutlass::layout::TensorNHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 128x64x64 MMA tile on a static 1x2x1 cluster -> 128x128x64 cluster tile.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_2,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 256x64x64 MMA tile spanning a static 2x1x1 cluster.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 256x64x64 MMA tile on a static 2x2x1 cluster -> 256x128x64 cluster tile.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2,_2,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operands: A = output-gradient activations, B = filter, D = computed input
  // gradient. All tensors are f16 in NDHWC; f16 accumulation, f32 epilogue compute.
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 64x64x64 MMA tile; cluster M/N are runtime ints (dynamic cluster shape),
  // so the preferred/fallback clusters are passed to the testbed below.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,143 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Fusion coverage: exercises the default linear-combination epilogue with
  // alpha != 1 and beta != 0 (arguments to TestAllConv below).
  using TypeA   = cutlass::half_t;
  using TypeB   = cutlass::half_t;
  using TypeD   = cutlass::half_t;
  using TypeAcc = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // Fusion coverage: linear combination with per-column bias, plus
  // alpha != 1 and beta != 0 (arguments to TestAllConv below).
  using TypeA    = cutlass::half_t;
  using TypeB    = cutlass::half_t;
  using TypeD    = cutlass::half_t;
  using TypeAcc  = cutlass::half_t;
  using TypeBias = cutlass::half_t;
  using TypeEpiCompute = float;

  // 128-bit (8-element f16) vector access for every operand.
  static constexpr int AlignA = 128 / cutlass::sizeof_bits<TypeA>::value;
  static constexpr int AlignB = 128 / cutlass::sizeof_bits<TypeB>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<TypeD>::value;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // D = alpha * acc + beta * C + per-column bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      TypeD, TypeEpiCompute, TypeBias>;

  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeEpiCompute,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeD, cutlass::layout::TensorNDHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Mainloop stage count is auto-derived after carving out epilogue smem.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      TypeA, cutlass::layout::TensorNDHWC, AlignA,
      TypeB, cutlass::layout::TensorNDHWC, AlignB,
      TypeAcc,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape follows the mainloop's conv operator and spatial rank.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

//==================================================================================================
// New file: sm100_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu (338 lines added)
//==================================================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64 (MMA tile 128x64 x cluster 1x2)
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64 (MMA tile 256x64 x cluster 2x2)
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Element types: A = activation gradient, B = filter, f32 output/accumulator.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints; the actual preferred/fallback
  // shapes are passed to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed from element width (128 / 16 bits == the
  // former hard-coded 8), consistent with the epilogue builder above.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=1, beta=0, no bias; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

//==================================================================================================
// New file: sm100_conv3d_dgrad_implicit_gemm_f8_f8_bf16_tensorop_f32.cu (338 lines added)
//==================================================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64 (MMA tile 128x64 x cluster 1x2)
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64 (MMA tile 256x64 x cluster 2x2)
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_bf16ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Element types: e4m3 A = activation gradient and B = filter, bf16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::bfloat16_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints; the actual preferred/fallback
  // shapes are passed to TestAllConv below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=1, beta=0, no bias; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

//==================================================================================================
// New file: sm100_conv3d_dgrad_implicit_gemm_f8_f8_f16_tensorop_f32.cu (338 lines added)
//==================================================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types: e4m3 A = activation gradient and B = filter, f16 output, f32 accumulator.
  using ElementAct = cutlass::float_e4m3_t;
  using ElementFlt = cutlass::float_e4m3_t;
  using ElementOut = cutlass::half_t;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue: builder-selected schedule; C/D tensors are NDHWC with 128-bit aligned access.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: dgrad. Alignments computed in bits (128 / 8 bits == the former
  // byte-based 16 / sizeof(...)), matching the epilogue builder's convention.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kDgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementFlt, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
    ElementAcc,
    MmaTileShape, ClusterShape,
    // Carve the epilogue's shared-memory footprint out of the stage budget.
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Reference-checked sweep with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Same e4m3-in / f16-out dgrad configuration with a 128-wide M tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// CTA tile 128x64 replicated twice along N by the 1x2x1 cluster,
// giving a 128x128x64 cluster tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_2,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// 256-wide M MMA tile spanning the 2x1x1 cluster.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_2,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// 256-wide M MMA tile across the 2-wide M cluster dimension, replicated
// twice along N by the cluster, giving a 256x128x64 cluster tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_2,_2,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Dynamic (runtime-sized) cluster: the X/Y cluster extents are runtime ints;
// the testbed launches with preferred cluster 2x4x1, falling back to 2x2x1.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{})); // dynamic X/Y extents, Z statically 1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1))); // alpha=1, beta=0; preferred cluster 2x4x1, fallback 2x2x1
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,237 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Exercises the plain linear-combination epilogue with non-trivial scaling
// (alpha=2, beta=1) for SM100 3-D dgrad with e4m3 inputs and f16 output.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float; // accumulator element type
using ElementCompute = float; // epilogue compute element type
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0)); // alpha=2, beta=1
}
// alpha != 1 && beta != 0 && bias
// Adds a per-column (per-channel) bias fusion on top of the alpha/beta
// linear combination.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t; // bias vector element type
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
// Epilogue fusion: linear combination plus per-column bias.
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
ElementOut, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0)); // alpha=2, beta=1
}
// alpha != 1 && beta != 0 && bias && relu
// Extends the bias fusion with a ReLU elementwise activation.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t; // bias vector element type
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
// Epilogue fusion: linear combination + per-column bias + ReLU.
using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0)); // alpha=2, beta=1
}
// per-column (per-channel) alpha/beta scaling && bias && relu
// Uses the per-column-scaled fusion variant; alpha/beta are supplied as
// per-column vectors through the fusion arguments rather than scalars,
// so TestAllConv runs with its default scalar parameters.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f16ndhwc_tensor_op_f32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = cutlass::half_t; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using ElementBias = cutlass::half_t; // bias vector element type
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
// Epilogue fusion: per-column alpha/beta scaling + per-column bias + ReLU.
using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto,
FusionOperation
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // default testbed parameters
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// SM100 tensor-op 3-D dgrad implicit-GEMM: e4m3 activation/filter inputs,
// f32 output, f32 accumulation, NDHWC layouts, 64x64x64 MMA tile on a
// static 1x1x1 cluster.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = float; // epilogue output element type
using ElementAcc = float; // accumulator element type
using ElementCompute = float; // epilogue compute element type
using MmaTileShape = Shape<_64, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value, // C tensor: element, layout, 128-bit alignment
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value, // D tensor: element, layout, 128-bit alignment
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct), // A operand, 16-byte (128-bit) aligned
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt), // B operand, 16-byte (128-bit) aligned
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Same e4m3-in / f32-out dgrad configuration with a 128-wide M tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = float; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// CTA tile 128x64 replicated twice along N by the 1x2x1 cluster,
// giving a 128x128x64 cluster tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = float; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_1,_2,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// 256-wide M MMA tile spanning the 2x1x1 cluster.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = float; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_2,_1,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// 256-wide M MMA tile across the 2-wide M cluster dimension, replicated
// twice along N by the cluster, giving a 256x128x64 cluster tile.
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
using ElementAct = cutlass::float_e4m3_t; // activation operand element type
using ElementFlt = cutlass::float_e4m3_t; // filter operand element type
using ElementOut = float; // epilogue output element type
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>; // MMA tile (M, N, K)
using ClusterShape = Shape<_2,_2,_1>; // static CTA cluster shape
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>, // carve epilogue smem out of the mainloop stage budget
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>()); // run the testbed's standard conv problem sweep
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
// Conv3d dgrad with a *dynamic* cluster: M/N cluster dims are runtime ints,
// resolved at launch from the preferred/fallback dims passed to TestAllConv.
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = float;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{})); // dynamic M/N, static K=1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Presumably (alpha, beta, ?, preferred cluster, fallback cluster) — matches
// the suite comment: preferred 2x4x1, fallback 2x2x1. TODO confirm testbed signature.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Smallest static-cluster configuration: 64x64x64 MMA tile on a 1x1x1 cluster,
  // e4m3 in / e4m3 out with f32 accumulation, all tensors NDHWC.
  using ElementA       = cutlass::float_e4m3_t;
  using ElementB       = cutlass::float_e4m3_t;
  using ElementD       = cutlass::float_e4m3_t;
  using ElementAccum   = float;
  using ElementEpiComp = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Build the epilogue first so its shared-memory requirement can be subtracted
  // from the mainloop stage budget via StageCountAutoCarveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementEpiComp,
      ElementA, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop with 16-byte aligned operand loads and an auto-selected schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kDgrad,
      ElementA, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementA),
      ElementB, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementB),
      ElementAccum,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // Exercise the kernel across the shared testbed's conv problem sweep.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
// Conv3d dgrad, e4m3 in/out, f32 accumulation; 128x64x64 MMA tile, static 1x1x1 cluster.
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value, // 128-bit aligned C
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value, // 128-bit aligned D
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 16-byte aligned A/B loads, auto schedule.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
// Conv3d dgrad, e4m3 in/out, f32 accumulation; 128x64x64 MMA tile on a 1x2x1
// cluster (cluster tile 128x128x64 per test name).
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_2,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 16-byte aligned A/B loads, auto schedule.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
// Conv3d dgrad, e4m3 in/out, f32 accumulation; 256x64x64 MMA tile on a 2x1x1 cluster.
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_1,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 16-byte aligned A/B loads, auto schedule.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
// Conv3d dgrad, e4m3 in/out, f32 accumulation; 256x64x64 MMA tile on a 2x2x1
// cluster (cluster tile 256x128x64 per test name).
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 16-byte aligned A/B loads, auto schedule.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_dgrad_implicitgemm_f8ndhwc_f8ndhwc_f8ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
// Conv3d dgrad (e4m3 in/out) with a *dynamic* cluster: M/N cluster dims are
// runtime ints supplied to TestAllConv as preferred/fallback dims.
using ElementAct = cutlass::float_e4m3_t;
using ElementFlt = cutlass::float_e4m3_t;
using ElementOut = cutlass::float_e4m3_t;
using ElementAcc = float;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{})); // dynamic M/N, static K=1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kDgrad,
ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Presumably (alpha, beta, ?, preferred cluster 2x4x1, fallback cluster 2x2x1)
// per the suite comment — TODO confirm against the testbed signature.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -32,6 +32,8 @@ add_custom_target(
cutlass_test_unit_conv1d_fprop_device_tensorop_sm90
cutlass_test_unit_conv2d_fprop_device_tensorop_sm90
cutlass_test_unit_conv3d_fprop_device_tensorop_sm90
cutlass_test_unit_conv_fprop_device_tensorop_sm100
cutlass_test_unit_conv_fprop_device_tensorop_sm100_fusion
)
cutlass_test_unit_add_executable(
@ -73,3 +75,50 @@ cutlass_test_unit_add_executable(
sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
)
# SM100 fprop test executables are only built when the 100a architecture is targeted.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
cutlass_test_unit_add_executable(
cutlass_test_unit_conv_fprop_device_tensorop_sm100
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
# s8 / f16 / tf32 fprop kernels across 1-D, 2-D, and 3-D convolutions.
sm100_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
sm100_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
sm100_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
sm100_conv1d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv2d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv3d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
sm100_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
sm100_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
)
# Separate executable for the epilogue-fusion variants of the same kernels.
cutlass_test_unit_add_executable(
cutlass_test_unit_conv_fprop_device_tensorop_sm100_fusion
# No batching of source to control compiler memory usage
BATCH_SOURCES ON
BATCH_SIZE 1
sm100_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
sm100_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
sm100_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32_with_fusion.cu
sm100_conv1d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv2d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv3d_fprop_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
sm100_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
sm100_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32_with_fusion.cu
)
endif()

View File

@ -0,0 +1,246 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Conv1d fprop, half-precision in/out with half accumulation, NWC layouts;
  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using ElementA       = cutlass::half_t;
  using ElementB       = cutlass::half_t;
  using ElementD       = cutlass::half_t;
  using ElementAccum   = cutlass::half_t;
  using ElementEpiComp = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1,_1,_1>;

  // Build the epilogue first so its shared-memory requirement can be subtracted
  // from the mainloop stage budget via StageCountAutoCarveout.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementEpiComp,
      ElementA, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Mainloop with 8-element (128-bit for half) aligned operand loads.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccum,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // Exercise the kernel across the shared testbed's conv problem sweep.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
// Conv1d fprop, f16 in/out with f16 accumulation, NWC layouts;
// 128x64x64 MMA tile, static 1x1x1 cluster.
using ElementAct = cutlass::half_t;
using ElementFlt = cutlass::half_t;
using ElementOut = cutlass::half_t;
using ElementAcc = cutlass::half_t;
using ElementCompute = float;
using MmaTileShape = Shape<_128, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 8-element (128-bit for half) aligned A/B loads.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kFprop,
ElementAct, cutlass::layout::TensorNWC, 8,
ElementFlt, cutlass::layout::TensorNWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
// Conv1d fprop, f16 in/out with f16 accumulation; 256x64x64 MMA tile on a
// 2x2x1 cluster (cluster tile 256x128x64 per test name).
using ElementAct = cutlass::half_t;
using ElementFlt = cutlass::half_t;
using ElementOut = cutlass::half_t;
using ElementAcc = cutlass::half_t;
using ElementCompute = float;
using MmaTileShape = Shape<_256, _64, Shape<_64>>;
using ClusterShape = Shape<_2,_2,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 8-element (128-bit for half) aligned A/B loads.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kFprop,
ElementAct, cutlass::layout::TensorNWC, 8,
ElementFlt, cutlass::layout::TensorNWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Run the shared testbed's conv problem-size sweep.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
// Conv1d fprop (f16) with a *dynamic* cluster: M/N cluster dims are runtime
// ints supplied to TestAllConv as preferred/fallback dims.
using ElementAct = cutlass::half_t;
using ElementFlt = cutlass::half_t;
using ElementOut = cutlass::half_t;
using ElementAcc = cutlass::half_t;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{})); // dynamic M/N, static K=1
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kFprop,
ElementAct, cutlass::layout::TensorNWC, 8,
ElementFlt, cutlass::layout::TensorNWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// Presumably (alpha, beta, ?, preferred cluster 2x4x1, fallback cluster 2x2x1)
// per the suite comment — TODO confirm against the testbed signature.
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,236 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
// Fusion variant of the 64x64x64/1x1x1 conv1d fprop test: runs with a
// non-default linear-combination scale (alpha=2, beta=1 — matches the
// "alpha != 1 && beta != 0" comment above this test).
using ElementAct = cutlass::half_t;
using ElementFlt = cutlass::half_t;
using ElementOut = cutlass::half_t;
using ElementAcc = cutlass::half_t;
using ElementCompute = float;
using MmaTileShape = Shape<_64, _64, Shape<_64>>;
using ClusterShape = Shape<_1,_1,_1>;
// Epilogue built first; its smem size feeds StageCountAutoCarveout below.
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
MmaTileShape, ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAcc, ElementCompute,
ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
cutlass::epilogue::collective::EpilogueScheduleAuto
>::CollectiveOp;
// Mainloop: 8-element (128-bit for half) aligned A/B loads.
using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
cutlass::conv::Operator::kFprop,
ElementAct, cutlass::layout::TensorNWC, 8,
ElementFlt, cutlass::layout::TensorNWC, 8,
ElementAcc,
MmaTileShape, ClusterShape,
cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
cutlass::conv::collective::KernelScheduleAuto
>::CollectiveOp;
using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
using ConvKernel = cutlass::conv::kernel::ConvUniversal<
ProblemShape,
CollectiveMainloop,
CollectiveEpilogue
>;
using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
// alpha = 2.0, beta = 1.0 (presumed testbed parameter order — consistent with the test name).
EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // Same kernel configuration as the alpha_beta test, plus a per-column f16
  // bias fused into the epilogue.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using ElementBias        = cutlass::half_t;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Linear combination with per-column bias: D = alpha * Acc + beta * C + bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOutput, ElementCompute, ElementBias>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Fused epilogue: per-column bias followed by a ReLU activation.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using ElementBias        = cutlass::half_t;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOutput, ElementCompute, ElementBias>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && gelu
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // Fused epilogue: per-column bias followed by Taylor-approximated GELU.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = cutlass::half_t;
  using ElementAccumulator = cutlass::half_t;
  using ElementCompute     = float;
  using ElementBias        = cutlass::half_t;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::GELU_taylor, ElementOutput, ElementCompute, ElementBias>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Wider tolerance (0.005) since the GELU Taylor approximation is inexact.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,292 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 activations/filters, f32 accumulation and output; default alpha/beta.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // 128-bit vectorized epilogue accesses.
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Same configuration as 64x64x64_1x1x1 but with a taller 128x64x64 MMA tile.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  // 128x64x64 MMA tile on a 1x2x1 cluster — the test name's 128x128x64 is the
  // cluster-level tile (N doubled across the two CTAs in the cluster).
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  // 256x64x64 MMA tile on a 2x2x1 cluster — 256x128x64 in the test name is the
  // cluster-level tile.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, 8,
    ElementFilter, LayoutNWC, 8,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic cluster: M/N extents are runtime ints (preferred 2x4x1, fallback
  // 2x2x1 passed to the testbed below); only the K extent is static.
  using ElementActivation  = cutlass::half_t;
  using ElementFilter      = cutlass::half_t;
  using ElementOutput      = float;
  using ElementAccumulator = float;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // 16-byte (128-bit) aligned accesses for all operands.
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  constexpr int AlignA = 16 / sizeof(ElementActivation);
  constexpr int AlignB = 16 / sizeof(ElementFilter);
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, AlignA,
    ElementFilter, LayoutNWC, AlignB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 1, beta = 0; preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,339 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1) {
  // s8 activations/filters, s32 accumulation/output; epilogue math in f32.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  // 64x64x64 MMA tile on a static single-CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Use the element aliases (not repeated int8_t/int32_t literals) so the
  // epilogue stays consistent with the mainloop and with the sibling tests.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: 16-byte aligned operands; stage count carved around the epilogue.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 128x64x64_1x1x1) {
  // s8 activations/filters, s32 accumulation/output; epilogue math in f32.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  // 128x64x64 MMA tile on a static single-CTA cluster.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Use the element aliases (not repeated int8_t/int32_t literals) so the
  // epilogue stays consistent with the mainloop and with the sibling tests.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 128x128x64_1x2x1) {
  // s8 activations/filters, s32 accumulation/output; epilogue math in f32.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  // 128x64x64 MMA tile on a 1x2x1 cluster (cluster tile 128x128x64).
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Use the element aliases (not repeated int8_t/int32_t literals) so the
  // epilogue stays consistent with the mainloop and with the sibling tests.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 256x64x64_2x1x1) {
  // s8 activations/filters, s32 accumulation/output; epilogue math in f32.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  // 256x64x64 MMA tile on a 2x1x1 cluster.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Use the element aliases (not repeated int8_t/int32_t literals) so the
  // epilogue stays consistent with the mainloop and with the sibling tests.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 256x128x64_2x2x1) {
  // s8 activations/filters, s32 accumulation/output; epilogue math in f32.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  // 256x64x64 MMA tile on a 2x2x1 cluster (cluster tile 256x128x64).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;
  // Use the element aliases (not repeated int8_t/int32_t literals) so the
  // epilogue stays consistent with the mainloop and with the sibling tests.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic cluster: M/N extents are runtime ints (preferred 2x4x1, fallback
  // 2x2x1 passed to the testbed below); only the K extent is static.
  using ElementActivation  = int8_t;
  using ElementFilter      = int8_t;
  using ElementOutput      = int32_t;
  using ElementAccumulator = int32_t;
  using ElementCompute     = float;
  using LayoutNWC = cutlass::layout::TensorNWC;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // 128-bit vectorized epilogue accesses.
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementCompute,
    ElementActivation, LayoutNWC, AlignC,
    ElementOutput, LayoutNWC, AlignD,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // 16-byte aligned mainloop operands.
  constexpr int AlignA = 16 / sizeof(ElementActivation);
  constexpr int AlignB = 16 / sizeof(ElementFilter);
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementActivation, LayoutNWC, AlignA,
    ElementFilter, LayoutNWC, AlignB,
    ElementAccumulator,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 1, beta = 0; preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -0,0 +1,378 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
// alpha != 1 && beta != 0
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
  // int8 activations/filters, int32 accumulation and output, fp32 epilogue compute.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 exercises the scaled source-tensor read.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Linear combination with a per-column bias vector fused into the epilogue.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 exercises the scaled source-tensor read alongside the bias.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Linear combination + per-column bias followed by a ReLU activation.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 exercises the scaled source-tensor read alongside bias + ReLU.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// per-channel alpha/beta scaling && bias && relu
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Per-column alpha/beta vectors + per-column bias, followed by a ReLU activation.
  using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Default testbed arguments; the fusion op supplies the per-column scale vectors.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
// alpha != 1 && beta != 0 && bias && gelu
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Linear combination + per-column bias followed by the tanh-approximation GELU.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Relaxed tolerance (0.05) accounts for the approximate GELU evaluation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.05f));
}
// alpha != 1 && beta != 0 && bias && gelu_erf
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Linear combination + per-column bias followed by the erf-based GELU.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU, ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Relaxed tolerance (0.005) accounts for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && swish
TEST(SM100_device_conv1d_fprop_implicitgemm_s8nwc_s8nwc_s32nwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
  // int8 activations/filters, int32 accumulation and output, fp32 compute and bias.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;

  // Single-CTA cluster; 64x64 tile with a 64-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Linear combination + per-column bias followed by SiLu (swish) activation.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::SiLu, ElementOut, ElementCompute, ElementBias>;

  // Epilogue reads C as int8 (ElementAct) and writes D as int32 (ElementOut).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Relaxed tolerance (0.005) accounts for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x32
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // Single-CTA cluster; 64x64 tile with a 32-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x32
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 128x64x32_1x1x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // Single-CTA cluster; 128x64 tile with a 32-deep K block.
  using MmaTileShape = Shape<_128, _64, Shape<_32>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x32
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 128x128x32_1x2x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 128x64 CTA tile; the 1x2x1 cluster doubles N for a 128x128 cluster tile.
  using MmaTileShape = Shape<_128, _64, Shape<_32>>;
  using ClusterShape = Shape<_1, _2, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x32
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 256x64x32_2x1x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 256x64 MMA tile spread over a 2x1x1 cluster.
  using MmaTileShape = Shape<_256, _64, Shape<_32>>;
  using ClusterShape = Shape<_2, _1, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x32
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 256x128x32_2x2x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // 256x64 MMA tile; the 2x2x1 cluster yields a 256x128 cluster tile.
  using MmaTileShape = Shape<_256, _64, Shape<_32>>;
  using ClusterShape = Shape<_2, _2, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x32
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// NOTE(review): test previously named 64x64x64_..., but the MMA tile's K extent is 32
// (Shape<_64,_64,Shape<_32>>, matching the banner above); renamed to 64x64x32_... so the
// gtest name agrees with the configuration under test.
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_preferred_2x4x1_fallback_2x2x1) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  // Dynamic cluster: first two cluster modes are runtime ints; the preferred/fallback
  // launch shapes are supplied to the testbed below.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Epilogue reads C as ElementAct and writes D as ElementOut, both NWC, 128-bit aligned.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 1, beta = 0; preferred cluster 2x4x1 with a 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,190 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
  // fp32 tensors throughout; the tensor-op mainloop consumes them as tf32.
  using ElementAct     = float;
  using ElementFlt     = float;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;

  // Single-CTA cluster; 64x64 tile with a 32-deep K block.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1, _1, _1>;

  // Epilogue reads C and writes D as fp32 NWC, 128-bit aligned (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count is derived automatically after carving out epilogue shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementFlt, cutlass::layout::TensorNWC, 128 / cutlass::sizeof_bits<ElementFlt>::value,
      ElementAcc,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2, beta = 1 exercises the scaled source-tensor read.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
  // Conv1D fprop, tf32 in / f32 out, with a fused per-column bias:
  // D = alpha * acc + beta * C + bias.
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using ElementBias = float;
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOut, ElementCompute, ElementBias>;
  // Use the declared ElementAct / ElementOut aliases for the C and D tensors
  // (previously literal `float`) for consistency with the sibling SM100 fprop
  // tests; the types are identical so the instantiation is unchanged.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 4,
    ElementOut, cutlass::layout::TensorNWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 4,
    ElementFlt, cutlass::layout::TensorNWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv1d_fprop_implicitgemm_tf32nwc_tf32nwc_f32nwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
  // Conv1D fprop, tf32 in / f32 out, with fused per-column bias + ReLU:
  // D = ReLU(alpha * acc + beta * C + bias).
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using ElementBias = float;
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  // Use the declared ElementAct / ElementOut aliases for the C and D tensors
  // (previously literal `float`) for consistency with the sibling SM100 fprop
  // tests; the types are identical so the instantiation is unchanged.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNWC, 4,
    ElementOut, cutlass::layout::TensorNWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNWC, 4,
    ElementFlt, cutlass::layout::TensorNWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

// ============================================================================
// (export artifact: boundary between concatenated test files — next file:
//  SM100 conv2d fprop f16 unit tests, 338 lines)
// ============================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // All-f16 fprop with a 64x64x64 cluster tile on a single-CTA cluster;
  // default epilogue (alpha = 1, beta = 0).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // All-f16 fprop with a 128x64x64 cluster tile on a single-CTA cluster.
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // All-f16 fprop: 128x64 MMA tile replicated across a 1x2x1 cluster
  // (cluster tile 128x128x64).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_1,_2,_1>;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // All-f16 fprop: 256x64 MMA tile on a 2x1x1 cluster.
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_2,_1,_1>;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // All-f16 fprop: 256x64 MMA tile on a 2x2x1 cluster
  // (cluster tile 256x128x64).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_2,_2,_1>;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Dynamic-cluster variant: M/N cluster extents are runtime values (int),
  // only the Z extent is static. The launch prefers a 2x4x1 cluster and
  // falls back to 2x2x1 if that shape cannot be granted.
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1, fallback cluster 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

// ============================================================================
// (export artifact: boundary between concatenated test files — next file:
//  SM100 conv2d fprop f16 epilogue-fusion unit tests, 237 lines)
// ============================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // All-f16 fprop exercising a non-trivial linear combination:
  // D = alpha * acc + beta * C with alpha = 2, beta = 1.
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // All-f16 fprop with a fused per-column bias:
  // D = alpha * acc + beta * C + bias (alpha = 2, beta = 1).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;
  using ElementBias       = cutlass::half_t;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOutput, ElementCompute, ElementBias>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // All-f16 fprop with fused per-column bias + ReLU:
  // D = ReLU(alpha * acc + beta * C + bias) (alpha = 2, beta = 1).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;
  using ElementBias       = cutlass::half_t;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOutput, ElementCompute, ElementBias>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && gelu
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // All-f16 fprop with fused per-column bias + GELU (Taylor approximation):
  // D = GELU(alpha * acc + beta * C + bias) (alpha = 2, beta = 1).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = cutlass::half_t;
  using ElementAccum      = cutlass::half_t;
  using ElementCompute    = float;
  using ElementBias       = cutlass::half_t;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, ElementOutput, ElementCompute, ElementBias>;

  // 128-bit vectorized access for the epilogue source (C) and output (D).
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0, relaxed tolerance (0.005) for the GELU approximation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

// ============================================================================
// (export artifact: boundary between concatenated test files — next file:
//  SM100 conv2d fprop f16-in/f32-out unit tests, 338 lines)
// ============================================================================
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 inputs with f32 accumulation and f32 output; 64x64x64 cluster tile
  // on a single-CTA cluster, default epilogue (alpha = 1, beta = 0).
  using ElementActivation = cutlass::half_t;
  using ElementFilter     = cutlass::half_t;
  using ElementOutput     = float;
  using ElementAccum      = float;
  using ElementCompute    = float;

  using ClusterShape = Shape<_1,_1,_1>;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;

  // 128-bit vectorized access: 8 halves for C, 4 floats for D.
  static constexpr int AlignC = 128 / cutlass::sizeof_bits<ElementActivation>::value;
  static constexpr int AlignD = 128 / cutlass::sizeof_bits<ElementOutput>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccum, ElementCompute,
      ElementActivation, cutlass::layout::TensorNHWC, AlignC,
      ElementOutput, cutlass::layout::TensorNHWC, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Stage count auto-sized after reserving the epilogue's shared memory.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementActivation, cutlass::layout::TensorNHWC, 8,
      ElementFilter, cutlass::layout::TensorNHWC, 8,
      ElementAccum,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp,
                                                       CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 activations (A) and filters (B), f32 accumulation, f32 output (D).
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;
  // Cluster-level MMA tile (M, N, K) with a static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;
  // Epilogue alignments sized for 128-bit vectorized global accesses.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, 8,
      ElementB, cutlass::layout::TensorNHWC, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f16 activations (A) and filters (B), f32 accumulation, f32 output (D).
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;
  // 128x64 MMA tile replicated across a 1x2x1 cluster (cluster tile 128x128).
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _2, _1>;
  // Epilogue alignments sized for 128-bit vectorized global accesses.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, 8,
      ElementB, cutlass::layout::TensorNHWC, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // f16 activations (A) and filters (B), f32 accumulation, f32 output (D).
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;
  // 256x64 cluster-level MMA tile across a 2x1x1 cluster.
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _1, _1>;
  // Epilogue alignments sized for 128-bit vectorized global accesses.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, 8,
      ElementB, cutlass::layout::TensorNHWC, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // f16 activations (A) and filters (B), f32 accumulation, f32 output (D).
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;
  // 256x64 MMA tile across a 2x2x1 cluster (cluster tile 256x128).
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _2, _1>;
  // Epilogue alignments sized for 128-bit vectorized global accesses.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, 8,
      ElementB, cutlass::layout::TensorNHWC, 8,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 A/B, f32 accumulation/output; 64x64x64 CTA tile with a
  // runtime-valued cluster (preferred 2x4x1, fallback 2x2x1).
  using ElementA           = cutlass::half_t;
  using ElementB           = cutlass::half_t;
  using ElementD           = float;
  using ElementAccumulator = float;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints, L is statically 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha=1, beta=0; pass preferred and fallback cluster dims to the testbed.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,339 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1) {
  // s8 activations/filters, s32 accumulation and output, f32 epilogue compute.
  // Cluster tile 64x64x64 with a static 1x1x1 cluster.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first: its shared-memory size feeds StageCountAutoCarveout
  // in the mainloop below. Alignments give 128-bit vectorized accesses.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    int8_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
    int32_t, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  // Fix: this alias was accidentally declared twice; a single declaration suffices.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the testbed's conv problem sizes with default alpha/beta.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 128x64x64_1x1x1) {
  // s8 activations (A) and filters (B), s32 accumulation and output (D),
  // f32 epilogue compute. Cluster tile 128x64x64, static 1x1x1 cluster.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 128x128x64_1x2x1) {
  // s8 activations (A) and filters (B), s32 accumulation and output (D),
  // f32 epilogue compute. 128x64 MMA tile over a 1x2x1 cluster.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_128, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _2, _1>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 256x64x64_2x1x1) {
  // s8 activations (A) and filters (B), s32 accumulation and output (D),
  // f32 epilogue compute. 256x64 cluster-level MMA tile over a 2x1x1 cluster.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _1, _1>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 256x128x64_2x2x1) {
  // s8 activations (A) and filters (B), s32 accumulation and output (D),
  // f32 epilogue compute. 256x64 MMA tile over a 2x2x1 cluster.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_256, _64, Shape<_64>>;
  using ClusterShape = Shape<_2, _2, _1>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Sweep the testbed's conv problem sizes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // s8 A/B, s32 accumulation/output, f32 epilogue compute; 64x64x64 CTA tile
  // with a runtime-valued cluster (preferred 2x4x1, fallback 2x2x1).
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints, L is statically 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<ElementA>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorNHWC, AlignmentC,
      ElementD, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha=1, beta=0; pass preferred and fallback cluster dims to the testbed.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -0,0 +1,378 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
// alpha != 1 && beta != 0
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
  // s8 fprop, default linear-combination epilogue; run with alpha=2, beta=1
  // so the source (C) operand participates in the output computation.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
  // s8 fprop with a linear-combination + per-column f32 bias epilogue fusion;
  // run with alpha=2, beta=1 so the source (C) operand participates.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using ElementBias        = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;
  // Fused epilogue visitor: D = act(alpha * acc + beta * C + per-col bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementD, ElementEpilogue, ElementBias>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // s8 fprop with linear-combination + per-column f32 bias + ReLU epilogue
  // fusion; run with alpha=2, beta=1 so the source (C) operand participates.
  using ElementA           = int8_t;
  using ElementB           = int8_t;
  using ElementD           = int32_t;
  using ElementAccumulator = int32_t;
  using ElementEpilogue    = float;
  using ElementBias        = float;
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1, _1, _1>;
  // Fused epilogue visitor: D = ReLU(alpha * acc + beta * C + per-col bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementD, ElementEpilogue, ElementBias>;
  // Alignments in elements: 128-bit epilogue accesses, 16-byte mainloop loads.
  constexpr int AlignmentC = 128 / cutlass::sizeof_bits<int8_t>::value;
  constexpr int AlignmentD = 128 / cutlass::sizeof_bits<int32_t>::value;
  constexpr int AlignmentA = 16 / sizeof(ElementA);
  constexpr int AlignmentB = 16 / sizeof(ElementB);
  // Build the epilogue first: its shared-memory footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      int8_t, cutlass::layout::TensorNHWC, AlignmentC,
      int32_t, cutlass::layout::TensorNHWC, AlignmentD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, cutlass::layout::TensorNHWC, AlignmentA,
      ElementB, cutlass::layout::TensorNHWC, AlignmentB,
      ElementAccumulator,
      MmaTileShape, ClusterShape,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  // Problem shape (conv op + spatial rank) deduced from the dispatch policy.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Device = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
// Fused epilogue check: per-column alpha/beta vectors, per-column bias, ReLU.
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  // int8 activations/filters, int32 accumulation and output, float epilogue math.
  using ActType     = int8_t;
  using FltType     = int8_t;
  using OutType     = int32_t;
  using AccType     = int32_t;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;  // 64x64 MMA tile, 64-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // Per-column scale vectors instead of scalar alpha/beta, plus bias and ReLU.
  using EpilogueFusion = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop; stages auto-selected after the
  // epilogue's shared-memory carve-out.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 16 / sizeof(ActType),
      FltType, cutlass::layout::TensorNHWC, 16 / sizeof(FltType),
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Default test arguments; the per-column scale tensors come from the testbed.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
// Fused epilogue check: alpha != 1, beta != 0, per-column bias, tanh-approx GELU.
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // int8 activations/filters, int32 accumulation and output, float epilogue math.
  using ActType     = int8_t;
  using FltType     = int8_t;
  using OutType     = int32_t;
  using AccType     = int32_t;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;  // 64x64 MMA tile, 64-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // GELU via the Taylor/tanh approximation rather than the erf form.
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 16 / sizeof(ActType),
      FltType, cutlass::layout::TensorNHWC, 16 / sizeof(FltType),
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Looser 0.005 tolerance accommodates the approximate GELU.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
// Fused epilogue check: alpha != 1, beta != 0, per-column bias, erf-based GELU.
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
  // int8 activations/filters, int32 accumulation and output, float epilogue math.
  using ActType     = int8_t;
  using FltType     = int8_t;
  using OutType     = int32_t;
  using AccType     = int32_t;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;  // 64x64 MMA tile, 64-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // Exact (erf-form) GELU activation.
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU, OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 16 / sizeof(ActType),
      FltType, cutlass::layout::TensorNHWC, 16 / sizeof(FltType),
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Looser 0.005 tolerance for the transcendental activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
// Fused epilogue check: alpha != 1, beta != 0, per-column bias, SiLU (swish).
TEST(SM100_device_conv2d_fprop_implicitgemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
  // int8 activations/filters, int32 accumulation and output, float epilogue math.
  using ActType     = int8_t;
  using FltType     = int8_t;
  using OutType     = int32_t;
  using AccType     = int32_t;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;  // 64x64 MMA tile, 64-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // SiLU activation (x * sigmoid(x)), a.k.a. swish.
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::SiLu, OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 16 / sizeof(ActType),
      FltType, cutlass::layout::TensorNHWC, 16 / sizeof(FltType),
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // Looser 0.005 tolerance for the sigmoid-based activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x32
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_32>>;  // 64x64 MMA tile, 32-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // Default linear-combination epilogue; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop; stage count auto-selected after the
  // epilogue's shared-memory carve-out.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x64x32
// Cluster shape 1x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 128x64x32_1x1x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using TileShapeMNK    = Shape<_128, _64, Shape<_32>>;  // 128x64 MMA tile, 32-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;               // single-CTA cluster
  // Default linear-combination epilogue; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x128x32
// Cluster shape 1x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 128x128x32_1x2x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  // Per-MMA tile 128x64; the 1x2x1 cluster yields a 128x128 cluster tile.
  using TileShapeMNK    = Shape<_128, _64, Shape<_32>>;
  using ClusterShapeMNK = Shape<_1,_2,_1>;
  // Default linear-combination epilogue; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 256x64x32
// Cluster shape 2x1x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 256x64x32_2x1x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using TileShapeMNK    = Shape<_256, _64, Shape<_32>>;  // 256-wide M with 2 CTAs in M
  using ClusterShapeMNK = Shape<_2,_1,_1>;
  // Default linear-combination epilogue; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 256x128x32
// Cluster shape 2x2x1
//
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 256x128x32_2x2x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  // 256x64 MMA tile; the 2x2x1 cluster extends N to a 128-wide cluster tile.
  using TileShapeMNK    = Shape<_256, _64, Shape<_32>>;
  using ClusterShapeMNK = Shape<_2,_2,_1>;
  // Default linear-combination epilogue; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x32
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// NOTE(review): the test name says 64x64x64 while the tile K mode here is 32
// (the banner above says 64x64x32) — looks like a copy-paste in the name;
// confirm before renaming, since CI filters may reference it.
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using TileShapeMNK = Shape<_64, _64, Shape<_32>>;
  // Runtime-valued M/N cluster extents enable dynamic cluster launch.
  using ClusterShapeMNK = decltype(make_shape(int(0), int(0), Int<1>{}));
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorNHWC, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 16 / sizeof(ActType),
      FltType, cutlass::layout::TensorNHWC, 16 / sizeof(FltType),
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha=1, beta=0; preferred cluster 2x4x1 with a 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,190 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// Scaling check: alpha != 1 and beta != 0 on the default linear-combination epilogue.
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
  // f32 tensors (tf32 in the MMA), f32 accumulation and epilogue compute.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_32>>;  // 64x64 MMA tile, 32-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // Default epilogue, no fusion operation; 4-element (128b) f32 alignment.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  // alpha = 2, beta = 1 exercise both scaling paths.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
// Scaling + bias check: alpha != 1, beta != 0, per-column bias, no activation.
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
  // f32 tensors (tf32 in the MMA), f32 accumulation, epilogue compute, and bias.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_32>>;  // 64x64 MMA tile, 32-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // D = alpha * acc + beta * C + per-column bias (no elementwise activation).
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBias<
      OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
// Fused epilogue check: alpha != 1, beta != 0, per-column bias, ReLU activation.
TEST(SM100_device_conv2d_fprop_implicitgemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
  // f32 tensors (tf32 in the MMA), f32 accumulation, epilogue compute, and bias.
  using ActType     = float;
  using FltType     = float;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;
  using BiasType    = float;
  using TileShapeMNK    = Shape<_64, _64, Shape<_32>>;  // 64x64 MMA tile, 32-deep K mode
  using ClusterShapeMNK = Shape<_1,_1,_1>;              // single-CTA cluster
  // D = ReLU(alpha * acc + beta * C + per-column bias)
  using EpilogueFusion = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, OutType, ComputeType, BiasType>;
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      OutType, cutlass::layout::TensorNHWC, 4,
      OutType, cutlass::layout::TensorNHWC, 4,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      EpilogueFusion
    >::CollectiveOp;
  // Forward-prop implicit-GEMM mainloop with auto stage count and schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ActType, cutlass::layout::TensorNHWC, 4,
      FltType, cutlass::layout::TensorNHWC, 4,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ConvProblem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                      Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel     = cutlass::conv::kernel::ConvUniversal<ConvProblem, Mainloop, Epilogue>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  // 128x64 MMA tile replicated over a 1x2x1 cluster -> 128x128x64 cluster tile.
  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _2, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  // 256-wide MMA tile in M, distributed over a 2x1x1 cluster.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  // 256x64 MMA tile over a 2x2x1 cluster -> 256x128x64 cluster tile.
  using TileShapeMNK    = Shape<_256, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_2, _2, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK = Shape<_64, _64, Shape<_64>>;
  // Dynamic cluster: M/N extents are runtime ints (chosen at launch); only
  // the third mode is static. The preferred/fallback cluster dims are passed
  // to TestAllConv below.
  using ClusterShapeMNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // alpha = 1, beta = 0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

// ---------------------------------------------------------------------------
// [extraction artifact] Boundary between concatenated files ("View File",
// diff hunk header "@ -0,0 +1,331 @@"). The section below is a separate
// test translation unit: conv3d fprop epilogue-fusion tests.
// ---------------------------------------------------------------------------
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // Exercise non-trivial linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type
  using ElementBias        = cutlass::half_t;

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Epilogue fusion: linear combination plus a per-column bias term.
  using FusionOp = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementD, ElementScalar, ElementBias>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOp
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // Exercise non-trivial linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type
  using ElementBias        = cutlass::half_t;

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Epilogue fusion: linear combination + per-column bias + ReLU activation.
  using FusionOp = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementD, ElementScalar, ElementBias>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOp
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // Exercise non-trivial linear combination: alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && gelu
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type
  using ElementBias        = cutlass::half_t;

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Epilogue fusion: linear combination + per-column bias + GELU (Taylor
  // approximation) activation.
  using FusionOp = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, ElementD, ElementScalar, ElementBias>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOp
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the approximate GELU.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && HardSwish
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_hardswish) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type
  using ElementBias        = cutlass::half_t;

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Epilogue fusion: linear combination + per-column bias + scaled HardSwish.
  using FusionOp = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ScaledHardSwish, ElementD, ElementScalar, ElementBias>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOp
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && leakyrelu
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta_bias_leakyrelu) {
  // Element types for this instantiation (all-f16 data, f16 accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = cutlass::half_t;  // output
  using ElementAccumulator = cutlass::half_t;
  using ElementScalar      = float;            // epilogue compute type
  using ElementBias        = cutlass::half_t;

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Epilogue fusion: linear combination + per-column bias + LeakyReLU.
  using FusionOp = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::LeakyReLU, ElementD, ElementScalar, ElementBias>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOp
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the nonlinearity.
  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

// ---------------------------------------------------------------------------
// [extraction artifact] Boundary between concatenated files ("View File",
// diff hunk header "@ -0,0 +1,338 @@"). The section below is a separate
// test translation unit: conv3d fprop f16 inputs with f32 output/accumulation.
// ---------------------------------------------------------------------------
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Element types for this instantiation (f16 inputs, f32 output and
  // accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = float;            // output
  using ElementAccumulator = float;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_64, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Element types for this instantiation (f16 inputs, f32 output and
  // accumulation).
  using ElementA           = cutlass::half_t;  // activation
  using ElementB           = cutlass::half_t;  // filter
  using ElementD           = float;            // output
  using ElementAccumulator = float;
  using ElementScalar      = float;            // epilogue compute type

  using LayoutTag = cutlass::layout::TensorNDHWC;

  // 128-bit vector access for the epilogue C/D tensors; the mainloop
  // A/B operands use an alignment of 8 elements.
  static constexpr int kAlignC  = 128 / cutlass::sizeof_bits<ElementA>::value;
  static constexpr int kAlignD  = 128 / cutlass::sizeof_bits<ElementD>::value;
  static constexpr int kAlignAB = 8;

  using TileShapeMNK    = Shape<_128, _64, Shape<_64>>;
  using ClusterShapeMNK = Shape<_1, _1, _1>;

  // Build the epilogue first: the mainloop stage count below is carved out
  // around the epilogue's shared-memory footprint.
  using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementScalar,
      ElementA, LayoutTag, kAlignC,
      ElementD, LayoutTag, kAlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using MainloopOp = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementA, LayoutTag, kAlignAB,
      ElementB, LayoutTag, kAlignAB,
      ElementAccumulator,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<
          static_cast<int>(sizeof(typename EpilogueOp::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Problem shape (conv op and spatial rank) deduced from the mainloop's
  // dispatch policy.
  using ConvProblem = cutlass::conv::ConvProblemShape<
      MainloopOp::DispatchPolicy::ConvOp,
      MainloopOp::DispatchPolicy::NumSpatialDimensions>;

  using KernelType = cutlass::conv::kernel::ConvUniversal<
      ConvProblem, MainloopOp, EpilogueOp>;
  using DeviceConv = cutlass::conv::device::ConvUniversalAdapter<KernelType>;

  EXPECT_TRUE(test::conv::device::TestAllConv<DeviceConv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // fp16 activations/filters, fp32 accumulate and output, NDHWC conv3d fprop.
  // 128x64 MMA tile replicated across a static 1x2x1 cluster -> 128x128x64 cluster tile.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1, _2, _1>;

  // 128-bit (16B) global-memory alignment for each operand.
  constexpr int AlignAct = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignFlt = 128 / cutlass::sizeof_bits<ElementFlt>::value;
  constexpr int AlignOut = 128 / cutlass::sizeof_bits<ElementOut>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementOut, cutlass::layout::TensorNDHWC, AlignOut,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementFlt, cutlass::layout::TensorNDHWC, AlignFlt,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // fp16 activations/filters, fp32 accumulate and output, NDHWC conv3d fprop.
  // 256x64 (2SM) MMA tile with a static 2x1x1 cluster -> 256x64x64 cluster tile.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2, _1, _1>;

  // 128-bit (16B) global-memory alignment for each operand.
  constexpr int AlignAct = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignFlt = 128 / cutlass::sizeof_bits<ElementFlt>::value;
  constexpr int AlignOut = 128 / cutlass::sizeof_bits<ElementOut>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementOut, cutlass::layout::TensorNDHWC, AlignOut,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementFlt, cutlass::layout::TensorNDHWC, AlignFlt,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // fp16 activations/filters, fp32 accumulate and output, NDHWC conv3d fprop.
  // 256x64 (2SM) MMA tile across a static 2x2x1 cluster -> 256x128x64 cluster tile.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2, _2, _1>;

  // 128-bit (16B) global-memory alignment for each operand.
  constexpr int AlignAct = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignFlt = 128 / cutlass::sizeof_bits<ElementFlt>::value;
  constexpr int AlignOut = 128 / cutlass::sizeof_bits<ElementOut>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementOut, cutlass::layout::TensorNDHWC, AlignOut,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementFlt, cutlass::layout::TensorNDHWC, AlignFlt,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // fp16 activations/filters, fp32 accumulate and output, NDHWC conv3d fprop.
  // Dynamic cluster: x/y dims are runtime ints; preferred 2x4x1, fallback 2x2x1
  // are passed to the testbed below.
  using ElementAct     = cutlass::half_t;
  using ElementFlt     = cutlass::half_t;
  using ElementOut     = float;
  using ElementAcc     = float;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit (16B) global-memory alignment for each operand.
  constexpr int AlignAct = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignFlt = 128 / cutlass::sizeof_bits<ElementFlt>::value;
  constexpr int AlignOut = 128 / cutlass::sizeof_bits<ElementOut>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementOut, cutlass::layout::TensorNDHWC, AlignOut,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementFlt, cutlass::layout::TensorNDHWC, AlignFlt,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, default tolerance; preferred and fallback cluster shapes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // 64x64 MMA tile with a static 1x1x1 cluster -> 64x64x64 cluster tile.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 128x64x64_1x1x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // 128x64 MMA tile with a static 1x1x1 cluster -> 128x64x64 cluster tile.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 128x128x64_1x2x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // 128x64 MMA tile across a static 1x2x1 cluster -> 128x128x64 cluster tile.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_128, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_2,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 256x64x64_2x1x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // 256x64 (2SM) MMA tile with a static 2x1x1 cluster -> 256x64x64 cluster tile.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2,_1,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 256x128x64_2x2x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // 256x64 (2SM) MMA tile across a static 2x2x1 cluster -> 256x128x64 cluster tile.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_256, _64, Shape<_64>>;
  using ClusterShape   = Shape<_2,_2,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Sweep the default conv problem set and compare against the host reference.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // int8 activations/filters, int32 accumulate and output, NDHWC conv3d fprop.
  // Dynamic cluster: x/y dims are runtime ints; preferred 2x4x1, fallback 2x2x1
  // are passed to the testbed below.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit (16B) global-memory alignment for each operand.
  constexpr int AlignAct = 128 / cutlass::sizeof_bits<ElementAct>::value;
  constexpr int AlignFlt = 128 / cutlass::sizeof_bits<ElementFlt>::value;
  constexpr int AlignOut = 128 / cutlass::sizeof_bits<ElementOut>::value;

  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementOut, cutlass::layout::TensorNDHWC, AlignOut,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, AlignAct,
      ElementFlt, cutlass::layout::TensorNDHWC, AlignFlt,
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0, default tolerance; preferred and fallback cluster shapes.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,473 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
// alpha != 1 && beta != 0
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta) {
  // int8 NDHWC conv3d fprop exercising the default linear-combination
  // epilogue with alpha != 1 and beta != 0.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused; the types (and alignments) are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias) {
  // int8 NDHWC conv3d fprop with linear combination + per-column bias fusion,
  // alpha != 1 and beta != 0.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
      ElementOut, ElementCompute, ElementBias>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here and left
  // ElementOut unused outside FusionOperation; the types are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_relu) {
  // int8 NDHWC conv3d fprop with per-column bias + ReLU fusion,
  // alpha != 1 and beta != 0.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here; the types are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2, beta = 1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// per-channel alpha/beta scaling && bias && relu
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_scaled_bias_relu) {
  // int8 NDHWC conv3d fprop with per-column alpha/beta scaling,
  // per-column bias and ReLU fusion.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::PerColLinCombPerColBiasEltAct<
      cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here; the types are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // Default testbed arguments; the per-column scale/bias vectors are
  // generated by the testbed for this fusion.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
// alpha != 1 && beta != 0 && bias && gelu
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu) {
  // int8 NDHWC conv3d fprop with per-column bias + GELU (Taylor approximation)
  // fusion, alpha != 1 and beta != 0.
  using ElementAct     = int8_t;
  using ElementFlt     = int8_t;
  using ElementOut     = int32_t;
  using ElementAcc     = int32_t;
  using ElementCompute = float;
  using ElementBias    = float;
  using MmaTileShape   = Shape<_64, _64, Shape<_64>>;
  using ClusterShape   = Shape<_1,_1,_1>;
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
      cutlass::epilogue::thread::GELU_taylor, ElementOut, ElementCompute, ElementBias>;
  // Fix: the epilogue builder now consumes the declared element aliases.
  // The original repeated raw int8_t/int32_t literals here; the types are identical.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      MmaTileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAcc, ElementCompute,
      ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
      ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto,
      FusionOperation
    >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kFprop,
      ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),   // 16B-aligned loads
      ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
      ElementAcc,
      MmaTileShape, ClusterShape,
      // Reserve the epilogue's shared-memory footprint when sizing mainloop stages.
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;
  using ProblemShape = cutlass::conv::ConvProblemShape<
      CollectiveMainloop::DispatchPolicy::ConvOp,
      CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<ProblemShape, CollectiveMainloop, CollectiveEpilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha = 2, beta = 1; relaxed tolerance (0.005) for the GELU approximation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && gelu_erf
// Same configuration as the gelu test above, but with the erf-based GELU
// functor (cutlass::epilogue::thread::GELU) instead of the Taylor approximation.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_gelu_erf) {
  // Implicit-GEMM operand element types: A = activation, B = filter.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;  // epilogue math in f32
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = GELU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::GELU, ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. C/D are 128-bit aligned NDHWC.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    int8_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
    int32_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: 16-byte aligned NDHWC A/B operands; stage count auto-derived
  // after reserving the epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=2, beta=1, relaxed tolerance 0.005 for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && swish
// Functional test: conv3d fprop, s8 NDHWC -> s32 NDHWC, fused per-column bias
// + SiLu (swish) activation in the epilogue.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_swish) {
  // Implicit-GEMM operand element types: A = activation, B = filter.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;  // epilogue math in f32
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = SiLu(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::SiLu, ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. C/D are 128-bit aligned NDHWC.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    int8_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
    int32_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: 16-byte aligned NDHWC A/B operands; stage count auto-derived
  // after reserving the epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=2, beta=1, relaxed tolerance 0.005 for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && leakyrelu
// Functional test: conv3d fprop, s8 NDHWC -> s32 NDHWC, fused per-column bias
// + LeakyReLU activation in the epilogue.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_leakyrelu) {
  // Implicit-GEMM operand element types: A = activation, B = filter.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;  // epilogue math in f32
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = LeakyReLU(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::LeakyReLU, ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. C/D are 128-bit aligned NDHWC.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    int8_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
    int32_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: 16-byte aligned NDHWC A/B operands; stage count auto-derived
  // after reserving the epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=2, beta=1, relaxed tolerance 0.005 for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
// alpha != 1 && beta != 0 && bias && hardswish
// Functional test: conv3d fprop, s8 NDHWC -> s32 NDHWC, fused per-column bias
// + hard-swish (ScaledHardSwish variant) activation in the epilogue.
TEST(SM100_device_conv3d_fprop_implicitgemm_s8ndhwc_s8ndhwc_s32ndhwc_tensor_op_s32, 64x64x64_1x1x1_alpha_beta_bias_hardswish) {
  // Implicit-GEMM operand element types: A = activation, B = filter.
  using ElementAct = int8_t;
  using ElementFlt = int8_t;
  using ElementOut = int32_t;
  using ElementAcc = int32_t;
  using ElementCompute = float;  // epilogue math in f32
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = ScaledHardSwish(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ScaledHardSwish, ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. C/D are 128-bit aligned NDHWC.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    int8_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int8_t>::value,
    int32_t, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<int32_t>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: 16-byte aligned NDHWC A/B operands; stage count auto-derived
  // after reserving the epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=2, beta=1, relaxed tolerance 0.005 for the nonlinear activation.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0f, 1.0f, 0.005f));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x32
// Cluster shape 1x1x1
//
// Baseline functional test: conv3d fprop, f32 NDHWC in memory, computed with
// tf32 tensor-core MMA (per the suite name); default alpha/beta, no fusion.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x32
// Cluster shape 1x1x1
//
// Same configuration as the 64x64x32 baseline above with a taller 128x64 MMA
// tile; single-CTA cluster, default alpha/beta, no fusion.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 128x64x32_1x1x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_128, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x32
// Cluster shape 1x2x1
//
// Multi-CTA cluster test: 128x64 MMA tile replicated across a 1x2x1 cluster
// yields the 128x128 cluster tile named in the test.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 128x128x32_1x2x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // Per-CTA MMA tile (M, N, K); cluster doubles N.
  using MmaTileShape = Shape<_128, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_2,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x64x32
// Cluster shape 2x1x1
//
// Multi-CTA cluster test: 256-wide M tile on a 2x1x1 cluster (presumably the
// 2-SM MMA path where M spans both CTAs — see the collective builder).
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 256x64x32_2x1x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and static 2x1x1 CTA cluster.
  using MmaTileShape = Shape<_256, _64, Shape<_32>>;
  using ClusterShape = Shape<_2,_1,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 256x128x32
// Cluster shape 2x2x1
//
// Largest static-cluster configuration in this suite: 256x64 MMA tile on a
// 2x2x1 cluster (N doubled across the cluster to the 128 named in the test).
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 256x128x32_2x2x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and static 2x2x1 CTA cluster.
  using MmaTileShape = Shape<_256, _64, Shape<_32>>;
  using ClusterShape = Shape<_2,_2,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x32
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Dynamic (runtime) cluster test: cluster M/N extents are runtime ints; the
// preferred/fallback shapes are supplied to the testbed at launch time.
// NOTE(review): the test name says 64x64x64 while the tile K here is 32
// (matching the comment above) — confirm which was intended.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  // Dynamic cluster: M and N are runtime values (0 placeholders here),
  // K is statically 1.
  using ClusterShape = decltype(make_shape(int(0), int(0), Int<1>{}));
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. 128-bit aligned C/D (4 floats).
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorNDHWC, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: 16-byte aligned NDHWC A/B operands; stage count auto-derived
  // after reserving the epilogue's shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementAct),
    ElementFlt, cutlass::layout::TensorNDHWC, 16 / sizeof(ElementFlt),
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // alpha=1, beta=0, default tolerance; preferred cluster 2x4x1 with
  // 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,190 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Fusion test: conv3d fprop, f32 NDHWC computed as tf32 (per the suite name),
// exercising the default linear-combination epilogue with non-trivial scalars.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // D = 2 * acc + 1 * C
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias
// Fusion test: adds a per-column f32 bias to the linear combination
// (no activation) on the tf32 conv3d fprop configuration.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = alpha * acc + beta * C + per-column bias.
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBias<
    ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // D = 2 * acc + 1 * C + bias
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
// alpha != 1 && beta != 0 && bias && relu
// Fusion test: per-column bias followed by ReLU on the tf32 conv3d fprop
// configuration.
TEST(SM100_device_conv3d_fprop_implicitgemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32, 64x64x32_1x1x1_alpha_beta_bias_relu) {
  using ElementAct = float;
  using ElementFlt = float;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  using ElementBias = float;
  // MMA tile (M, N, K) and static 1x1x1 CTA cluster.
  using MmaTileShape = Shape<_64, _64, Shape<_32>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Fusion: D = ReLu(alpha * acc + beta * C + per-column bias).
  using FusionOperation = cutlass::epilogue::fusion::LinCombPerColBiasEltAct<
    cutlass::epilogue::thread::ReLu, ElementOut, ElementCompute, ElementBias>;
  // Epilogue built first so its SharedStorage can be carved out of the
  // mainloop smem budget below. Alignment 4 floats == 16 bytes.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    float, cutlass::layout::TensorNDHWC, 4,
    float, cutlass::layout::TensorNDHWC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto,
    FusionOperation
  >::CollectiveOp;
  // Mainloop: stage count auto-derived after reserving the epilogue's
  // shared-memory footprint.
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kFprop,
    ElementAct, cutlass::layout::TensorNDHWC, 4,
    ElementFlt, cutlass::layout::TensorNDHWC, 4,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  // D = ReLu(2 * acc + 1 * C + bias)
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -172,6 +172,8 @@ struct ConvTestbed {
static constexpr bool IsBiasEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithBias<FusionOp>::value &&
!cute::is_same_v<BiasType, void>;
static constexpr bool IsPerChannelScaleEnabled = cutlass::epilogue::collective::detail::IsThreadEpilogueOpWithPerChannelScaled<FusionOp>::value;
static constexpr bool DisableSource = cute::is_void_v<typename FusionOp::ElementSource>;
using StrideC = typename Conv::ConvKernel::StrideC;
@ -213,10 +215,24 @@ struct ConvTestbed {
tensor_D_computed.resize(sizeof(ElementD) * problem_shape.size_C());
tensor_D_reference.resize(sizeof(ElementD) * problem_shape.size_C());
tensor_bias.resize(sizeof(ElementBias) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
if constexpr (IsPerChannelScaleEnabled) {
tensor_alpha.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
tensor_beta.resize(sizeof(ElementScalar) * cute::size(cute::get<0>(problem_shape.get_shape_B())));
}
initialize_values(tensor_A, init_A, seed);
initialize_values(tensor_B, init_B, seed * 11);
initialize_values(tensor_C, init_C, seed * 17);
initialize_values(tensor_bias, init_bias, seed * 19);
if constexpr (IsPerChannelScaleEnabled) {
initialize_values(tensor_alpha, init_bias, seed * 23);
if constexpr (DisableSource) {
initialize_values(tensor_beta, init_disable, seed * 27);
}
else {
initialize_values(tensor_beta, init_bias, seed * 27);
}
}
bool flag = true;
if constexpr (isSparseEnabled) {
flag &= params.initialize(problem_shape, tensor_B, static_cast<int>(seed + 2023));
@ -314,8 +330,9 @@ struct ConvTestbed {
bool run(
ProblemShape const& problem_shape,
ElementScalar alpha = ElementScalar(1),
ElementScalar beta = ElementScalar(0)
,
ElementScalar beta = ElementScalar(0),
dim3 cluster_shape = dim3(0, 0, 0),
dim3 cluster_shape_fallback = dim3(0, 0, 0),
RasterOrderOptions raster_order = RasterOrderOptions::Heuristic,
MaxSwizzleSize max_swizzle = MaxSwizzleSize{},
Splits splits = Splits{},
@ -341,6 +358,9 @@ struct ConvTestbed {
cudaGetDevice(&hw_info.device_id);
hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
hw_info.cluster_shape = cluster_shape;
hw_info.cluster_shape_fallback = cluster_shape_fallback;
// configure the operator
Conv conv_op;
auto stride_C = StrideC{};
@ -392,6 +412,11 @@ struct ConvTestbed {
fusion_args.alpha = alpha;
fusion_args.beta = beta;
if constexpr (IsPerChannelScaleEnabled) {
fusion_args.alpha_ptr = tensor_alpha.data().get();
fusion_args.beta_ptr = tensor_beta.data().get();
}
if constexpr (IsBiasEnabled) {
fusion_args.bias_ptr = tensor_bias.data().get();
}
@ -478,6 +503,11 @@ struct ConvTestbed {
epilogue_fusion_params.alpha = alpha;
epilogue_fusion_params.beta = beta;
if constexpr (IsPerChannelScaleEnabled) {
epilogue_fusion_params.tensor_alpha = mAlpha;
epilogue_fusion_params.tensor_beta = mBeta;
}
if constexpr (IsBiasEnabled) {
epilogue_fusion_params.tensor_bias = mBias;
}
@ -638,6 +668,16 @@ struct ConvTestbed {
for (size_t i = 0; i < size_t(size(B)); ++i) {
printf("[%llu]: B = %f\n", static_cast<unsigned long long>(i), float(B(i)));
}
if constexpr (IsPerChannelScaleEnabled) {
for (size_t i = 0; i < size_t(size(tensor_alpha)); ++i) {
printf("[%llu]: alpha = %f\n", static_cast<unsigned long long>(i),
float(tensor_alpha(i)));
}
for (size_t i = 0; i < size_t(size(tensor_beta)); ++i) {
printf("[%llu]: beta = %f\n", static_cast<unsigned long long>(i),
float(tensor_beta(i)));
}
}
if constexpr (IsBiasEnabled) {
for (size_t i = 0; i < size_t(size(tensor_bias)); ++i) {
printf("[%llu]: bias = %f\n", static_cast<unsigned long long>(i),
@ -657,7 +697,9 @@ struct ConvTestbed {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename Conv, bool SupportStrides = (Conv::DispatchPolicy::ConvOp != cutlass::conv::Operator::kDgrad)>
bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f
bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f,
dim3 cluster_shape = dim3(0, 0, 0),
dim3 cluster_shape_fallback = dim3(0, 0, 0)
) {
using ElementScalar = typename Conv::EpilogueOutputOp::ElementScalar;
@ -697,8 +739,10 @@ bool TestAllConv(double alpha = 1.0, double beta = 0.0, float epsilon = 0.0f
passed = testbed.run(
conv_problem,
cutlass::from_real<ElementScalar>(alpha),
cutlass::from_real<ElementScalar>(beta)
,RasterOrderOptions::Heuristic, // raster_order
cutlass::from_real<ElementScalar>(beta),
cluster_shape,
cluster_shape_fallback,
RasterOrderOptions::Heuristic, // raster_order
MaxSwizzleSize(1),
splits,
decomp_mode

View File

@ -30,6 +30,8 @@ add_custom_target(
cutlass_test_unit_conv_wgrad_device
DEPENDS
cutlass_test_unit_conv_wgrad_device_tensorop_sm90
cutlass_test_unit_conv_wgrad_device_tensorop_sm100
cutlass_test_unit_conv_wgrad_device_tensorop_sm100_fusion
)
cutlass_test_unit_add_executable(
@ -44,3 +46,26 @@ cutlass_test_unit_add_executable(
sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
)
# SM100 (Blackwell) wgrad unit tests — only built when targeting the 100a arch.
if (CUTLASS_NVCC_ARCHS MATCHES 100a)
# Basic f16/f32-accumulate wgrad kernels (1D/2D/3D).
cutlass_test_unit_add_executable(
cutlass_test_unit_conv_wgrad_device_tensorop_sm100
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv3d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16.cu
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
# NOTE(review): conv3d f32-accumulate wgrad test is intentionally commented out —
# confirm whether this configuration is unsupported or just not yet enabled.
#sm100_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
)
# Epilogue-fusion wgrad tests, split across translation units to limit build time.
cutlass_test_unit_add_executable_split_file(
cutlass_test_unit_conv_wgrad_device_tensorop_sm100_fusion
sm100_conv1d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv2d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
sm100_conv3d_wgrad_implicit_gemm_f16_f16_f16_tensorop_f16_with_fusion.cu
)
endif()

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Build the TMA warp-specialized (1-SM) epilogue first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x64x64_1x1x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // No-smem (direct-store), 1-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 128x128x64_1x2x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // No-smem (direct-store), 1-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x64x64_2x1x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 256x64x64 MMA tile on a static 2x1x1 cluster (2-SM MMA).
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // No-smem (direct-store), 2-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 256x128x64_2x2x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 256x64x64 MMA tile on a static 2x2x1 cluster (256x128x64 cluster tile).
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // No-smem (direct-store), 2-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 64x64x64 CTA tile; cluster M/N extents are runtime values (dynamic cluster),
  // only the K extent is static.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Auto-selected epilogue schedule; built first so the mainloop can carve its
  // stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Run with preferred cluster 2x4x1 and fallback cluster 2x2x1 (alpha=1, beta=0).
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,96 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Exercises the linear-combination epilogue with alpha != 1 and beta != 0.
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f16nwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // f16 operands, f16 accumulation, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = cutlass::half_t;  // output (KCS)
  using ElementAccumulator = cutlass::half_t;
  using ElementEpilogue    = float;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // No-smem (direct-store), 1-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Run the verification sweep with alpha=2, beta=1.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 operands, f32 accumulation/output, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = float;            // output (KCS)
  using ElementAccumulator = float;
  using ElementEpilogue    = float;

  // 64x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Build the TMA warp-specialized (1-SM) epilogue first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 operands, f32 accumulation/output, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = float;            // output (KCS)
  using ElementAccumulator = float;
  using ElementEpilogue    = float;

  // 128x64x64 MMA tile on a static 1x1x1 cluster.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // No-smem (direct-store), 1-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 128x128x64_1x2x1) {
  // f16 operands, f32 accumulation/output, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = float;            // output (KCS)
  using ElementAccumulator = float;
  using ElementEpilogue    = float;

  // 128x64x64 MMA tile on a static 1x2x1 cluster (128x128x64 cluster tile).
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // No-smem (direct-store), 1-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x64x64_2x1x1) {
  // f16 operands, f32 accumulation/output, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = float;            // output (KCS)
  using ElementAccumulator = float;
  using ElementEpilogue    = float;

  // 256x64x64 MMA tile on a static 2x1x1 cluster (2-SM MMA).
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // No-smem (direct-store), 2-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 256x128x64_2x2x1) {
  // f16 operands, f32 accumulation/output, f32 epilogue compute.
  using ElementA           = cutlass::half_t;  // first mainloop operand (NWC)
  using ElementB           = cutlass::half_t;  // second mainloop operand (NWC)
  using ElementD           = float;            // output (KCS)
  using ElementAccumulator = float;
  using ElementEpilogue    = float;

  // 256x64x64 MMA tile on a static 2x2x1 cluster (256x128x64 cluster tile).
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // No-smem (direct-store), 2-SM epilogue; built first so the mainloop can
  // carve its stage count around the epilogue's shared-memory footprint.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      ElementAccumulator, ElementEpilogue,
      ElementA, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementA>::value,
      ElementD, cutlass::layout::TensorKCS, 128 / cutlass::sizeof_bits<ElementD>::value,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  // Wgrad mainloop with auto-selected kernel schedule.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ElementA, cutlass::layout::TensorNWC, 8,
      ElementB, cutlass::layout::TensorNWC, 8,
      ElementAccumulator,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level operator and run the verification sweep.
  using ProblemShape = cutlass::conv::ConvProblemShape<
      Mainloop::DispatchPolicy::ConvOp, Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel      = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using ConvAdapter = cutlass::conv::device::ConvUniversalAdapter<Kernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<ConvAdapter>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Wgrad 1-D conv with a runtime (dynamic) cluster: CTA tile 64x64x64,
// preferred cluster 2x4x1, fallback cluster 2x2x1.
TEST(SM100_device_conv1d_wgrad_implicitgemm_f16nwc_f16nwc_f32nwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  using TileShape_MNK = Shape<_64, Shape<_64>, Shape<_64>>;
  // Dynamic cluster: X/Y extents are runtime ints, Z is statically 1.
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCS, AlignC,
      OutType, cutlass::layout::TensorKCS, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNWC, 8,
      FltType, cutlass::layout::TensorNWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// Wgrad 2-D conv: all-f16 operands/accumulator with f32 epilogue compute.
// Cluster tile 64x64x64 on a 1x1x1 cluster, TMA warp-specialized 1-SM epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Wgrad 2-D conv: all-f16 operands/accumulator with f32 epilogue compute.
// Cluster tile 128x64x64 on a 1x1x1 cluster, no-smem 1-SM epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Wgrad 2-D conv: all-f16 operands/accumulator with f32 epilogue compute.
// Cluster tile 128x128x64 formed by a 1x2x1 cluster of 128x64x64 MMA tiles.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 128x128x64_1x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Wgrad 2-D conv: all-f16 operands/accumulator with f32 epilogue compute.
// Cluster tile 256x64x64 formed by a 2x1x1 cluster, 2-SM no-smem epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x64x64_2x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Wgrad 2-D conv: all-f16 operands/accumulator with f32 epilogue compute.
// Cluster tile 256x128x64 formed by a 2x2x1 cluster, 2-SM no-smem epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
// Wgrad 2-D conv with a runtime (dynamic) cluster: CTA tile 64x64x64,
// preferred cluster 2x4x1, fallback cluster 2x2x1.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  using TileShape_MNK = Shape<_64, Shape<_64>, Shape<_64>>;
  // Dynamic cluster: X/Y extents are runtime ints, Z is statically 1.
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with 2x2x1 fallback.
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,96 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Fusion coverage: wgrad 2-D conv with non-trivial linear combination
// (alpha=2, beta=1), exercising the epilogue's source (C) accumulation path.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = cutlass::half_t;  // epilogue output element
  using AccType     = cutlass::half_t;  // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  // D = 2 * conv(A, B) + 1 * C
  EXPECT_TRUE(test::conv::device::TestAllConv<Device>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
// Wgrad 2-D conv: f16 activations/filters, f32 accumulate, f32 output.
// Cluster tile 64x64x64 on a 1x1x1 cluster, no-smem 1-SM epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
// Wgrad 2-D conv: f16 activations/filters, f32 accumulate, f32 output.
// Cluster tile 128x64x64 on a 1x1x1 cluster, no-smem 1-SM epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
// Wgrad 2-D conv: f16 activations/filters, f32 accumulate, f32 output.
// Cluster tile 128x128x64 formed by a 1x2x1 cluster, TMA warp-specialized
// 1-SM epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 128x128x64_1x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_2,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
// Wgrad 2-D conv: f16 activations/filters, f32 accumulate, f32 output.
// Cluster tile 256x64x64 formed by a 2x1x1 cluster, 2-SM no-smem epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x64x64_2x1x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_1,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
// Wgrad 2-D conv: f16 activations/filters, f32 accumulate, f32 output.
// Cluster tile 256x128x64 formed by a 2x2x1 cluster, 2-SM no-smem epilogue.
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // Operand / accumulator element types.
  using ActType     = cutlass::half_t;  // activation operand
  using FltType     = cutlass::half_t;  // filter-gradient operand
  using OutType     = float;            // epilogue output element
  using AccType     = float;            // MMA accumulator
  using EpiCompType = float;            // epilogue compute precision

  // Tile and cluster configuration under test.
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // 128-bit vectorized accesses for the epilogue source (C) and output (D).
  constexpr int AlignC = 128 / cutlass::sizeof_bits<ActType>::value;
  constexpr int AlignD = 128 / cutlass::sizeof_bits<OutType>::value;

  // Build the epilogue first; its shared-storage footprint is carved out of
  // the mainloop stage count below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, EpiCompType,
      ActType, cutlass::layout::TensorKCSR, AlignC,
      OutType, cutlass::layout::TensorKCSR, AlignD,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNHWC, 8,
      FltType, cutlass::layout::TensorNHWC, 8,
      AccType,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  // Assemble the device-level conv from problem shape, mainloop, and epilogue.
  using Problem = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                  Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using Kernel  = cutlass::conv::kernel::ConvUniversal<Problem, Mainloop, Epilogue>;
  using Device  = cutlass::conv::device::ConvUniversalAdapter<Kernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Device>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv2d_wgrad_implicitgemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // f16 activation/filter inputs; f32 output, accumulation, and epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = float;
  using TypeAcc     = float;
  using TypeCompute = float;

  using TileShape_MNK = Shape<_64, Shape<_64>, Shape<_64>>;
  // Runtime (dynamic) cluster: the M/N extents are plain ints resolved at launch;
  // the preferred/fallback shapes are passed to TestAllConv below.
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSR, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNHWC, 8,
      TypeFlt, cutlass::layout::TensorNHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,338 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1) {
  // All-f16 data path with f32 epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = cutlass::half_t;
  using TypeAcc     = cutlass::half_t;
  using TypeCompute = float;

  // Single-CTA cluster over a 64x64x64 tile.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::TmaWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNDHWC, 8,
      TypeFlt, cutlass::layout::TensorNDHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x64x64_1x1x1) {
  // All-f16 data path with f32 epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = cutlass::half_t;
  using TypeAcc     = cutlass::half_t;
  using TypeCompute = float;

  // Single-CTA cluster over a 128x64x64 tile.
  using TileShape_MNK    = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNDHWC, 8,
      TypeFlt, cutlass::layout::TensorNDHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 128x128x64_1x2x1) {
// using ElementAct = cutlass::half_t;
// using ElementFlt = cutlass::half_t;
// using ElementOut = cutlass::half_t;
// using ElementAcc = cutlass::half_t;
// using ElementCompute = float;
// using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
// using ClusterShape = Shape<_1,_2,_1>;
//
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// MmaTileShape, ClusterShape,
// cutlass::epilogue::collective::EpilogueTileAuto,
// ElementAcc, ElementCompute,
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
// cutlass::epilogue::NoSmemWarpSpecialized1Sm
// >::CollectiveOp;
//
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// cutlass::conv::Operator::kWgrad,
// ElementAct, cutlass::layout::TensorNDHWC, 8,
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
// ElementAcc,
// MmaTileShape, ClusterShape,
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
// cutlass::conv::collective::KernelScheduleAuto
// >::CollectiveOp;
//
// using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
//   ProblemShape,
//   CollectiveMainloop,
//   CollectiveEpilogue
// >;
//
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
//
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
//}
//
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x64x64_2x1x1) {
// using ElementAct = cutlass::half_t;
// using ElementFlt = cutlass::half_t;
// using ElementOut = cutlass::half_t;
// using ElementAcc = cutlass::half_t;
// using ElementCompute = float;
// using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
// using ClusterShape = Shape<_2,_1,_1>;
//
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// MmaTileShape, ClusterShape,
// cutlass::epilogue::collective::EpilogueTileAuto,
// ElementAcc, ElementCompute,
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
// cutlass::epilogue::NoSmemWarpSpecialized2Sm
// >::CollectiveOp;
//
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// cutlass::conv::Operator::kWgrad,
// ElementAct, cutlass::layout::TensorNDHWC, 8,
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
// ElementAcc,
// MmaTileShape, ClusterShape,
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
// cutlass::conv::collective::KernelScheduleAuto
// >::CollectiveOp;
//
// using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
//   ProblemShape,
//   CollectiveMainloop,
//   CollectiveEpilogue
// >;
//
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
//
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
//}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 256x128x64_2x2x1) {
  // All-f16 data path with f32 epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = cutlass::half_t;
  using TypeAcc     = cutlass::half_t;
  using TypeCompute = float;

  // Static 2x2x1 cluster over the MMA tile (cluster tile 256x128x64 per the test name).
  using TileShape_MNK    = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_2,_2,_1>;

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::NoSmemWarpSpecialized2Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNDHWC, 8,
      TypeFlt, cutlass::layout::TensorNDHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // All-f16 data path with f32 epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = cutlass::half_t;
  using TypeAcc     = cutlass::half_t;
  using TypeCompute = float;

  using TileShape_MNK = Shape<_64, Shape<_64>, Shape<_64>>;
  // Runtime (dynamic) cluster: the M/N extents are plain ints resolved at launch;
  // the preferred/fallback shapes are passed to TestAllConv below.
  using ClusterShape_MNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNDHWC, 8,
      TypeFlt, cutlass::layout::TensorNDHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha=1, beta=0; preferred cluster 2x4x1 with fallback 2x2x1.
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,96 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
// alpha != 1 && beta != 0
// Exercises a non-default linear combination: alpha = 2, beta = 1.
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f16ndhwc_tensor_op_f16, 64x64x64_1x1x1_alpha_beta) {
  // All-f16 data path with f32 epilogue compute.
  using TypeAct     = cutlass::half_t;
  using TypeFlt     = cutlass::half_t;
  using TypeOut     = cutlass::half_t;
  using TypeAcc     = cutlass::half_t;
  using TypeCompute = float;

  // Single-CTA cluster over a 64x64x64 tile.
  using TileShape_MNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape_MNK = Shape<_1,_1,_1>;

  // Build the epilogue first: the mainloop carves its stage count out of the
  // shared memory left over after the epilogue's SharedStorage.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      TypeAcc, TypeCompute,
      TypeAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeAct>::value,
      TypeOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<TypeOut>::value,
      cutlass::epilogue::NoSmemWarpSpecialized1Sm
    >::CollectiveOp;

  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      TypeAct, cutlass::layout::TensorNDHWC, 8,
      TypeFlt, cutlass::layout::TensorNDHWC, 8,
      TypeAcc,
      TileShape_MNK, ClusterShape_MNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ProblemShape = cutlass::conv::ConvProblemShape<Mainloop::DispatchPolicy::ConvOp,
                                                       Mainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel   = cutlass::conv::kernel::ConvUniversal<ProblemShape, Mainloop, Epilogue>;
  using Conv         = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // alpha = 2.0, beta = 1.0 (source accumulation enabled).
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(2.0, 1.0));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -0,0 +1,326 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
/*! \file
\brief Tests for device-wide CONV interface
*/
#include "cutlass_unit_test.h"
#include "cutlass/cutlass.h"
#include "cute/tensor.hpp"
#include "cute/atom/mma_atom.hpp"
#include "cutlass/numeric_types.h"
#include "cutlass/conv/device/conv_universal_adapter.hpp"
#include "cutlass/conv/kernel/conv_universal.hpp"
#include "cutlass/conv/collective/collective_builder.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "../testbed_conv.hpp"
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
//////////////////////////////////////////////////////////////////////////////////////////////////
// Static cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cluster tile shape 64x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_1x1x1) {
  // f16 activation/filter inputs; f32 output, accumulation, and epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // Single-CTA cluster over a 64x64x64 tile.
  using MmaTileShape = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first: the mainloop carves its stage count around the
  // epilogue's shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::TmaWarpSpecialized1Sm
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  // Fix: ConvUniversal takes the problem shape as its first template argument;
  // it was missing here, unlike every sibling wgrad test in this suite.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x64x64
// Cluster shape 1x1x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x64x64_1x1x1) {
  // f16 activation/filter inputs; f32 output, accumulation, and epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // Single-CTA cluster over a 128x64x64 tile.
  using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_1,_1,_1>;
  // Epilogue built first: the mainloop carves its stage count around the
  // epilogue's shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized1Sm
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  // Fix: ConvUniversal takes the problem shape as its first template argument;
  // it was missing here, unlike every sibling wgrad test in this suite.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//
// Cluster tile shape 128x128x64
// Cluster shape 1x2x1
//
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 128x128x64_1x2x1) {
// using ElementAct = cutlass::half_t;
// using ElementFlt = cutlass::half_t;
// using ElementOut = float;
// using ElementAcc = float;
// using ElementCompute = float;
// using MmaTileShape = Shape<_128, Shape<_64>, Shape<_64>>;
// using ClusterShape = Shape<_1,_2,_1>;
//
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// MmaTileShape, ClusterShape,
// cutlass::epilogue::collective::EpilogueTileAuto,
// ElementAcc, ElementCompute,
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
// cutlass::epilogue::NoSmemWarpSpecialized1Sm
// >::CollectiveOp;
//
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// cutlass::conv::Operator::kWgrad,
// ElementAct, cutlass::layout::TensorNDHWC, 8,
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
// ElementAcc,
// MmaTileShape, ClusterShape,
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
// cutlass::conv::collective::KernelScheduleAuto
// >::CollectiveOp;
//
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
// CollectiveMainloop,
// CollectiveEpilogue
// >;
//
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
//
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
//}
//
//
// Cluster tile shape 256x64x64
// Cluster shape 2x1x1
//
//TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x64x64_2x1x1) {
// using ElementAct = cutlass::half_t;
// using ElementFlt = cutlass::half_t;
// using ElementOut = float;
// using ElementAcc = float;
// using ElementCompute = float;
// using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
// using ClusterShape = Shape<_2,_1,_1>;
//
// using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// MmaTileShape, ClusterShape,
// cutlass::epilogue::collective::EpilogueTileAuto,
// ElementAcc, ElementCompute,
// ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
// ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
// cutlass::epilogue::NoSmemWarpSpecialized2Sm
// >::CollectiveOp;
//
// using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
// cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
// cutlass::conv::Operator::kWgrad,
// ElementAct, cutlass::layout::TensorNDHWC, 8,
// ElementFlt, cutlass::layout::TensorNDHWC, 8,
// ElementAcc,
// MmaTileShape, ClusterShape,
// cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
// cutlass::conv::collective::KernelScheduleAuto
// >::CollectiveOp;
//
// using ConvKernel = cutlass::conv::kernel::ConvUniversal<
// CollectiveMainloop,
// CollectiveEpilogue
// >;
//
// using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
//
// EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
//}
//
// Cluster tile shape 256x128x64
// Cluster shape 2x2x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 256x128x64_2x2x1) {
  // f16 activation/filter inputs; f32 output, accumulation, and epilogue compute.
  using ElementAct = cutlass::half_t;
  using ElementFlt = cutlass::half_t;
  using ElementOut = float;
  using ElementAcc = float;
  using ElementCompute = float;
  // Static 2x2x1 cluster over the MMA tile (cluster tile 256x128x64 per the test name).
  using MmaTileShape = Shape<_256, Shape<_64>, Shape<_64>>;
  using ClusterShape = Shape<_2,_2,_1>;
  // Epilogue built first: the mainloop carves its stage count around the
  // epilogue's shared-memory footprint.
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    MmaTileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAcc, ElementCompute,
    ElementAct, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementAct>::value,
    ElementOut, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ElementOut>::value,
    cutlass::epilogue::NoSmemWarpSpecialized2Sm
  >::CollectiveOp;
  using CollectiveMainloop = typename cutlass::conv::collective::CollectiveBuilder<
    cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
    cutlass::conv::Operator::kWgrad,
    ElementAct, cutlass::layout::TensorNDHWC, 8,
    ElementFlt, cutlass::layout::TensorNDHWC, 8,
    ElementAcc,
    MmaTileShape, ClusterShape,
    cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::conv::collective::KernelScheduleAuto
  >::CollectiveOp;
  // Fix: ConvUniversal takes the problem shape as its first template argument;
  // it was missing here, unlike every sibling wgrad test in this suite.
  using ProblemShape=cutlass::conv::ConvProblemShape<CollectiveMainloop::DispatchPolicy::ConvOp, CollectiveMainloop::DispatchPolicy::NumSpatialDimensions>;
  using ConvKernel = cutlass::conv::kernel::ConvUniversal<
    ProblemShape,
    CollectiveMainloop,
    CollectiveEpilogue
  >;
  using Conv = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>());
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dynamic cluster
//////////////////////////////////////////////////////////////////////////////////////////////////
//
// CTA tile shape 64x64x64
// preferred cluster shape 2x4x1
// fallback cluster shape 2x2x1
//
TEST(SM100_device_conv3d_wgrad_implicitgemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32, 64x64x64_preferred_2x4x1_fallback_2x2x1) {
  // FP16 activation/filter inputs; FP32 accumulation, epilogue compute, and output.
  using ActType     = cutlass::half_t;
  using FltType     = cutlass::half_t;
  using OutType     = float;
  using AccType     = float;
  using ComputeType = float;

  // 64x64x64 CTA tile; cluster extents in M and N are runtime values
  // (dynamic cluster), only the K mode is static.
  using TileShapeMNK    = Shape<_64, Shape<_64>, Shape<_64>>;
  using ClusterShapeMNK = decltype(make_shape(int(0), int(0), Int<1>{}));

  // Epilogue built first so its shared-storage footprint can be carved out
  // of the mainloop stage budget below.
  using Epilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::epilogue::collective::EpilogueTileAuto,
      AccType, ComputeType,
      ActType, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<ActType>::value,
      OutType, cutlass::layout::TensorKCSRT, 128 / cutlass::sizeof_bits<OutType>::value,
      cutlass::epilogue::collective::EpilogueScheduleAuto
    >::CollectiveOp;

  // Wgrad mainloop over NDHWC activations/filters, 8-element alignment.
  using Mainloop = typename cutlass::conv::collective::CollectiveBuilder<
      cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp,
      cutlass::conv::Operator::kWgrad,
      ActType, cutlass::layout::TensorNDHWC, 8,
      FltType, cutlass::layout::TensorNDHWC, 8,
      AccType,
      TileShapeMNK, ClusterShapeMNK,
      cutlass::conv::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename Epilogue::SharedStorage))>,
      cutlass::conv::collective::KernelScheduleAuto
    >::CollectiveOp;

  using ConvKernel = cutlass::conv::kernel::ConvUniversal<Mainloop, Epilogue>;
  using Conv       = cutlass::conv::device::ConvUniversalAdapter<ConvKernel>;

  // Preferred cluster 2x4x1, fallback cluster 2x2x1; leading scalars are
  // alpha = 1.0, beta = 0.0 (third argument per TestAllConv's parameter list —
  // confirm against the testbed signature).
  EXPECT_TRUE(test::conv::device::TestAllConv<Conv>(1.0, 0.0, 0.0f, dim3(2,4,1), dim3(2,2,1)));
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

View File

@ -50,6 +50,6 @@ cutlass_test_unit_add_executable(
pointer.cpp
reverse.cpp
swizzle_layout.cpp
transform.cpp
tensor_algs.cpp
tuple.cpp
)

View File

@ -0,0 +1,200 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cute/algorithm/tensor_algorithms.hpp>
#include <cute/algorithm/tensor_reduce.hpp>
#include <cute/numeric/complex.hpp>
TEST(CuTe_algorithm, TensorTransform) {
  using namespace cute;

  // In-place elementwise transform of a rank-1 complex tensor:
  // conjugation negates each imaginary part.
  complex<float> data[4]     = {{0,0}, {1,0}, {0,1}, {1,1}};
  complex<float> expected[4] = {{0,0}, {1,0}, {0,-1}, {1,-1}};

  Tensor t = make_tensor(static_cast<complex<float>*>(data), make_layout(make_shape(4)));
  transform(t, conjugate{});

  for (int i = 0; i < 4; ++i) {
    EXPECT_EQ(t(i), expected[i]);
  }
}
TEST(CuTe_algorithm, TensorBatchReduce) {
  using namespace cute;

  // Source has hierarchical shape ((2,2),(2,2)): mode-0 uses strides (2,8),
  // mode-1 uses strides (1,4) over the identity data 0..15.
  int src_vals[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
  Tensor src = make_tensor(static_cast<int*>(src_vals),
                           make_layout(make_shape (make_shape (2,2), make_shape (2,2)),
                                       make_stride(make_stride(2,8), make_stride(1,4))));

  // Zero-initialized 2x2 destination.
  array<int, 4> dst_vals;
  fill(dst_vals, 0);
  Tensor dst = make_tensor(dst_vals.begin(), make_shape(2,2));

  batch_reduce(src, dst);

  // dst(j) accumulates src(i,j) over the first mode:
  // e.g. dst(0) = 0 + 2 + 8 + 10 = 20.
  int const expected[4] = {20,24,36,40};
  for (int i = 0; i < 4; ++i) {
    EXPECT_EQ(dst(i), expected[i]);
  }
}
// Exercises cute::logical_reduce over a variety of slicer/target profiles.
// The assertions below show the contract: underscore (_) modes of the profile
// are kept in dst, and the remaining modes are reduced away.
TEST(CuTe_algorithm, TensorLogicalReduce) {
using namespace cute;
{ // Reduce each column of a matrix
// Source is a 32 x (12,6) layout over a counting iterator; the slicer
// (0_c, _) keeps mode-1, so each column (mode-0) is reduced into dst.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape <_32, Shape <_12,_6>>,
Stride< _1, Stride<_64,_1>>>{});
auto slicer = make_coord(0_c, _);
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
logical_reduce(src_tensor, dst_tensor, slicer);
// Reference: reduce each column slice independently with init value 0.
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), reduce(src_tensor(_,i), int(0)));
}
}
{ // Reduce each row of a matrix
// Same source layout; the slicer (_, 0_c) keeps mode-0, so each row is
// reduced across mode-1.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape <_32, Shape <_12,_6>>,
Stride< _1, Stride<_64,_1>>>{});
auto slicer = make_coord(_, 0_c);
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
logical_reduce(src_tensor, dst_tensor, slicer);
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), reduce(src_tensor(i,_), int(0)));
}
}
{ // 1 profile
// A purely integral profile keeps nothing: the whole tensor collapses
// into a single scalar destination.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape<_32>, Stride<_1>>{});
array<int, 1> dst_vals;
fill(dst_vals, 0);
Tensor dst_tensor = make_tensor(dst_vals.begin(), Layout<_1,_0>{});
logical_reduce(src_tensor, dst_tensor, 1);
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), reduce(src_tensor, int(0)));
}
}
{ // _ profile
// An all-underscore profile keeps every mode, so the "reduction"
// degenerates to an elementwise copy.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape<_32>, Stride<_1>>{});
auto slicer = _;
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
logical_reduce(src_tensor, dst_tensor, slicer);
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), src_tensor(i));
}
}
{ // (1,1) profile
// Rank-2 integral profile: both modes are reduced, again yielding a scalar.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape <_32, Shape <_12,_6>>,
Stride< _1, Stride<_192,_32>>>{});
auto slicer = make_coord(1, 1);
array<int, 1> dst_vals;
fill(dst_vals, 0);
Tensor dst_tensor = make_tensor(dst_vals.begin(), Layout<_1,_0>{});
logical_reduce(src_tensor, dst_tensor, slicer);
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), reduce(src_tensor, int(0)));
}
}
{ // (_,_) profile
// Rank-2 all-underscore profile: both modes kept, dst == src elementwise.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
Layout<Shape <_32, Shape <_12,_6>>,
Stride< _1, Stride<_192,_32>>>{});
auto slicer = make_coord(_,_);
Tensor dst_tensor = make_tensor_like(src_tensor(slicer));
logical_reduce(src_tensor, dst_tensor, slicer);
for (int i = 0; i < size(dst_tensor); ++i) {
EXPECT_EQ(dst_tensor(i), src_tensor(i));
}
}
{ // Mixed rank-4 profile (_,1,_,1): keep modes 0 and 2, reduce modes 1 and 3.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
make_layout(make_shape (2,2,2,2),
make_stride(1,2,4,8)));
array<int, 4> dst_vals;
fill(dst_vals, 0);
Tensor dst_tensor = make_tensor(dst_vals.begin(), make_shape(2,2));
auto target_profile = make_coord(_,1,_,1);
logical_reduce(src_tensor, dst_tensor, target_profile);
// Sums over the reduced modes of the counting data 0..15,
// e.g. dst(0) = 0 + 2 + 8 + 10 = 20.
int correct[4] = {20,24,36,40};
for (int i = 0; i < 4; ++i) {
//printf("%d %d\n", dst_tensor(i), correct[i]);
EXPECT_EQ(dst_tensor(i), correct[i]);
}
}
{ // Nested profile (_,(1,_),1): the profile mirrors the hierarchical shape
// (2,(2,2),2) and selects the kept sub-modes within it.
Tensor src_tensor = make_tensor(counting_iterator<int>{0},
make_layout(make_shape (2,make_shape (2,2),2),
make_stride(1,make_stride(2,4),8)));
array<int, 4> dst_vals;
fill(dst_vals, 0);
Tensor dst_tensor = make_tensor(dst_vals.begin(), make_shape(2,2));
auto target_profile = make_coord(_,make_coord(1,_),1);
logical_reduce(src_tensor, dst_tensor, target_profile);
// Same expected sums as the flat rank-4 case above (identical strides).
int correct[4] = {20,24,36,40};
for (int i = 0; i < 4; ++i) {
//printf("%d %d\n", dst_tensor(i), correct[i]);
EXPECT_EQ(dst_tensor(i), correct[i]);
}
}
}

View File

@ -1,49 +0,0 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************************************/
#include "cutlass_unit_test.h"
#include <cutlass/trace.h>
#include <cute/tensor.hpp>
#include <cute/numeric/complex.hpp>
// In-place elementwise transform of a rank-1 complex tensor: applies
// cute::conjugate to every element and checks the imaginary parts flipped sign.
TEST(CuTe_core, Transform) {
using namespace cute;
complex<float> array[4] = {{0,0}, {1,0}, {0,1}, {1,1}};
// Expected values after conjugation (imaginary parts negated).
complex<float> correct[4] = {{0,0}, {1,0}, {0,-1}, {1,-1}};
auto tensor = make_tensor(static_cast<complex<float>*>(array), make_layout(make_shape(4)));
conjugate conj;
transform(tensor, conj);
for (int i = 0; i < 4; ++i)
{
EXPECT_EQ(tensor(i), correct[i]);
}
}

View File

@ -54,7 +54,7 @@
using namespace cute;
#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED))
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////// 128x64x128 Cluster1x1x1 TMEM 4x1 ////////////////////////////////////////////
@ -263,5 +263,5 @@ TEST(SM100_Device_Gemm_s8t_s8n_s8n_tensorop_2cta_s32_ptr_array, 128x1024x128_2x4
EXPECT_TRUE(pass);
}
#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && !defined(CUTLASS_SM100_FAMILY_ARCHS_ENABLED)

View File

@ -51,6 +51,9 @@ TEST(SM90_nvrtc_kernel, Contraction) {
"-std=c++17",
"-arch=sm_90",
"-I" CUDA_INCLUDE_DIR,
#if (__CUDACC_VER_MAJOR__ >= 13)
"-I" CUDA_INCLUDE_DIR "/cccl",
#endif // __CUDACC_VER_MAJOR__ >= 13
};
EXPECT_TRUE(test::nvrtc::thread::TestbedKernel::compile(
@ -60,7 +63,7 @@ TEST(SM90_nvrtc_kernel, Contraction) {
"cute::Shape<cute::_1, cute::_2, cute::_1>,"
"true, true,"
"10, 10, 10, 10>::Kernel",
{ nvrtc_opts, nvrtc_opts + 5 }
{ std::begin(nvrtc_opts), std::end(nvrtc_opts) }
));
}
#endif